arize-phoenix 4.5.0__py3-none-any.whl → 4.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of arize-phoenix has been flagged as potentially problematic.
- {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.1.dist-info}/METADATA +16 -8
- {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.1.dist-info}/RECORD +122 -58
- {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.1.dist-info}/WHEEL +1 -1
- phoenix/__init__.py +0 -27
- phoenix/config.py +42 -7
- phoenix/core/model.py +25 -25
- phoenix/core/model_schema.py +64 -62
- phoenix/core/model_schema_adapter.py +27 -25
- phoenix/datetime_utils.py +4 -0
- phoenix/db/bulk_inserter.py +54 -14
- phoenix/db/insertion/dataset.py +237 -0
- phoenix/db/insertion/evaluation.py +10 -10
- phoenix/db/insertion/helpers.py +17 -14
- phoenix/db/insertion/span.py +3 -3
- phoenix/db/migrations/types.py +29 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +291 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +2 -28
- phoenix/db/models.py +236 -4
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +153 -0
- phoenix/experiments/evaluators/code_evaluators.py +99 -0
- phoenix/experiments/evaluators/llm_evaluators.py +244 -0
- phoenix/experiments/evaluators/utils.py +186 -0
- phoenix/experiments/functions.py +757 -0
- phoenix/experiments/tracing.py +85 -0
- phoenix/experiments/types.py +753 -0
- phoenix/experiments/utils.py +24 -0
- phoenix/inferences/fixtures.py +23 -23
- phoenix/inferences/inferences.py +7 -7
- phoenix/inferences/validation.py +1 -1
- phoenix/server/api/context.py +20 -0
- phoenix/server/api/dataloaders/__init__.py +20 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +100 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +43 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +85 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +43 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +49 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +2 -3
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/trace_row_ids.py +39 -0
- phoenix/server/api/helpers/dataset_helpers.py +179 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +9 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/mutations/__init__.py +13 -0
- phoenix/server/api/mutations/auth.py +11 -0
- phoenix/server/api/mutations/dataset_mutations.py +520 -0
- phoenix/server/api/mutations/experiment_mutations.py +65 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +17 -14
- phoenix/server/api/mutations/project_mutations.py +47 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +6 -0
- phoenix/server/api/openapi/schema.py +16 -0
- phoenix/server/api/queries.py +503 -0
- phoenix/server/api/routers/v1/__init__.py +77 -2
- phoenix/server/api/routers/v1/dataset_examples.py +178 -0
- phoenix/server/api/routers/v1/datasets.py +965 -0
- phoenix/server/api/routers/v1/evaluations.py +8 -13
- phoenix/server/api/routers/v1/experiment_evaluations.py +143 -0
- phoenix/server/api/routers/v1/experiment_runs.py +220 -0
- phoenix/server/api/routers/v1/experiments.py +302 -0
- phoenix/server/api/routers/v1/spans.py +9 -5
- phoenix/server/api/routers/v1/traces.py +1 -4
- phoenix/server/api/schema.py +2 -303
- phoenix/server/api/types/AnnotatorKind.py +10 -0
- phoenix/server/api/types/Cluster.py +19 -19
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/Dataset.py +282 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +30 -29
- phoenix/server/api/types/EmbeddingDimension.py +40 -34
- phoenix/server/api/types/Event.py +16 -16
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +147 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +19 -0
- phoenix/server/api/types/ExperimentRun.py +91 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +57 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/Model.py +43 -42
- phoenix/server/api/types/Project.py +26 -12
- phoenix/server/api/types/Span.py +79 -2
- phoenix/server/api/types/TimeSeries.py +6 -6
- phoenix/server/api/types/Trace.py +15 -4
- phoenix/server/api/types/UMAPPoints.py +1 -1
- phoenix/server/api/types/node.py +5 -111
- phoenix/server/api/types/pagination.py +10 -52
- phoenix/server/app.py +103 -49
- phoenix/server/main.py +49 -27
- phoenix/server/openapi/docs.py +3 -0
- phoenix/server/static/index.js +2300 -1294
- phoenix/server/templates/index.html +1 -0
- phoenix/services.py +15 -15
- phoenix/session/client.py +581 -22
- phoenix/session/session.py +47 -37
- phoenix/trace/exporter.py +14 -9
- phoenix/trace/fixtures.py +133 -7
- phoenix/trace/schemas.py +1 -2
- phoenix/trace/span_evaluations.py +3 -3
- phoenix/trace/trace_dataset.py +6 -6
- phoenix/utilities/json.py +61 -0
- phoenix/utilities/re.py +50 -0
- phoenix/version.py +1 -1
- phoenix/server/api/types/DatasetRole.py +0 -23
- {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.1.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.5.0.dist-info → arize_phoenix-4.6.1.dist-info}/licenses/LICENSE +0 -0
- /phoenix/server/api/{helpers.py → helpers/__init__.py} +0 -0
phoenix/experiments/evaluators/base.py (new file)
@@ -0,0 +1,153 @@
+import functools
+import inspect
+from abc import ABC
+from types import MappingProxyType
+from typing import Any, Awaitable, Callable, Optional, Union
+
+from typing_extensions import TypeAlias
+
+from phoenix.experiments.evaluators.utils import validate_evaluator_signature
+from phoenix.experiments.types import (
+    AnnotatorKind,
+    EvaluationResult,
+    EvaluatorKind,
+    EvaluatorName,
+    EvaluatorOutput,
+    ExampleInput,
+    ExampleMetadata,
+    ExampleOutput,
+    TaskOutput,
+)
+
+
+class Evaluator(ABC):
+    """
+    A helper super class to guide the implementation of an `Evaluator` object.
+    Subclasses must implement either the `evaluate` or `async_evaluate` method.
+    Implementing both methods is recommended, but not required.
+
+    This Class is intended to be subclassed, and should not be instantiated directly.
+    """
+
+    _kind: AnnotatorKind
+    _name: EvaluatorName
+
+    @functools.cached_property
+    def name(self) -> EvaluatorName:
+        if hasattr(self, "_name"):
+            return self._name
+        return self.__class__.__name__
+
+    @functools.cached_property
+    def kind(self) -> EvaluatorKind:
+        if hasattr(self, "_kind"):
+            return self._kind.value
+        return AnnotatorKind.CODE.value
+
+    def __new__(cls, *args: Any, **kwargs: Any) -> "Evaluator":
+        if cls is Evaluator:
+            raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
+        return object.__new__(cls)
+
+    def evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        expected: Optional[ExampleOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **kwargs: Any,
+    ) -> EvaluationResult:
+        # For subclassing, one should implement either this sync method or the
+        # async version. Implementing both is recommended but not required.
+        raise NotImplementedError
+
+    async def async_evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        expected: Optional[ExampleOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **kwargs: Any,
+    ) -> EvaluationResult:
+        # For subclassing, one should implement either this async method or the
+        # sync version. Implementing both is recommended but not required.
+        return self.evaluate(
+            output=output,
+            expected=expected,
+            metadata=metadata,
+            input=input,
+            **kwargs,
+        )
+
+    def __init_subclass__(cls, is_abstract: bool = False, **kwargs: Any) -> None:
+        super().__init_subclass__(**kwargs)
+        if is_abstract:
+            return
+        evaluate_fn_signature = inspect.signature(Evaluator.evaluate)
+        for super_cls in inspect.getmro(cls):
+            if super_cls in (LLMEvaluator, Evaluator):
+                break
+            if evaluate := super_cls.__dict__.get(Evaluator.evaluate.__name__):
+                assert callable(evaluate), "`evaluate()` method should be callable"
+                # need to remove the first param, i.e. `self`
+                _validate_sig(functools.partial(evaluate, None), "evaluate")
+                return
+            if async_evaluate := super_cls.__dict__.get(Evaluator.async_evaluate.__name__):
+                assert callable(async_evaluate), "`async_evaluate()` method should be callable"
+                # need to remove the first param, i.e. `self`
+                _validate_sig(functools.partial(async_evaluate, None), "async_evaluate")
+                return
+        raise ValueError(
+            f"Evaluator must implement either "
+            f"`def evaluate{evaluate_fn_signature}` or "
+            f"`async def async_evaluate{evaluate_fn_signature}`"
+        )
+
+
+def _validate_sig(fn: Callable[..., Any], fn_name: str) -> None:
+    sig = inspect.signature(fn)
+    validate_evaluator_signature(sig)
+    for param in sig.parameters.values():
+        if param.kind is inspect.Parameter.VAR_KEYWORD:
+            return
+    else:
+        raise ValueError(f"`{fn_name}` should allow variadic keyword arguments `**kwargs`")
+
+
+class CodeEvaluator(Evaluator, ABC, is_abstract=True):
+    """
+    A convenience super class for defining code evaluators.
+
+    This class is intended to be subclassed, and should not be instantiated directly.
+    """
+
+    _kind = AnnotatorKind.CODE
+
+    def __new__(cls, *args: Any, **kwargs: Any) -> "CodeEvaluator":
+        if cls is CodeEvaluator:
+            raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
+        return object.__new__(cls)
+
+
+class LLMEvaluator(Evaluator, ABC, is_abstract=True):
+    """
+    A convenience super class for defining LLM evaluators.
+
+    This class is intended to be subclassed, and should not be instantiated directly.
+    """
+
+    _kind = AnnotatorKind.LLM
+
+    def __new__(cls, *args: Any, **kwargs: Any) -> "LLMEvaluator":
+        if cls is LLMEvaluator:
+            raise TypeError(f"{cls.__name__} is an abstract class and should not be instantiated.")
+        return object.__new__(cls)
+
+
+ExperimentEvaluator: TypeAlias = Union[
+    Evaluator,
+    Callable[..., EvaluatorOutput],
+    Callable[..., Awaitable[EvaluatorOutput]],
+]
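For orientation, here is a minimal sketch of how the new base classes are meant to be subclassed: a custom evaluator derives from `CodeEvaluator` and implements `evaluate` with keyword-only parameters plus a variadic `**kwargs`, which `__init_subclass__` validates via `_validate_sig`. The class name `OutputIsNonEmpty` below is illustrative and not part of the package.

from typing import Any, Optional

from phoenix.experiments.evaluators.base import CodeEvaluator
from phoenix.experiments.types import EvaluationResult, TaskOutput


class OutputIsNonEmpty(CodeEvaluator):
    """Illustrative evaluator: scores 1.0 when the task output is a non-empty string."""

    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
        # `**_` satisfies the `**kwargs` requirement enforced by `__init_subclass__`.
        is_non_empty = isinstance(output, str) and bool(output.strip())
        return EvaluationResult(score=float(is_non_empty))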
phoenix/experiments/evaluators/code_evaluators.py (new file)
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, List, Optional, Union
+
+from phoenix.experiments.evaluators.base import CodeEvaluator
+from phoenix.experiments.types import EvaluationResult, TaskOutput
+
+
+class JSONParsable(CodeEvaluator):
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        try:
+            json.loads(output)
+            json_parsable = True
+        except BaseException:
+            json_parsable = False
+        return EvaluationResult(
+            score=int(json_parsable),
+        )
+
+
+class ContainsKeyword(CodeEvaluator):
+    def __init__(self, keyword: str, name: Optional[str] = None) -> None:
+        self.keyword = keyword
+        self._name = name or f"Contains({repr(keyword)})"
+
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        found = self.keyword in output
+        return EvaluationResult(
+            score=float(found),
+            explanation=(
+                f"the string {repr(self.keyword)} was "
+                f"{'found' if found else 'not found'} in the output"
+            ),
+        )
+
+
+class ContainsAnyKeyword(CodeEvaluator):
+    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
+        self.keywords = keywords
+        self._name = name or f"ContainsAny({keywords})"
+
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        found = [keyword for keyword in self.keywords if keyword in output]
+        if found:
+            explanation = f"the keywords {found} were found in the output"
+        else:
+            explanation = f"none of the keywords {self.keywords} were found in the output"
+        return EvaluationResult(
+            score=float(bool(found)),
+            explanation=explanation,
+        )
+
+
+class ContainsAllKeywords(CodeEvaluator):
+    def __init__(self, keywords: List[str], name: Optional[str] = None) -> None:
+        self.keywords = keywords
+        self._name = name or f"ContainsAll({keywords})"
+
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        not_found = [keyword for keyword in self.keywords if keyword not in output]
+        if not_found:
+            contains_all = False
+            explanation = f"the keywords {not_found} were not found in the output"
+        else:
+            contains_all = True
+            explanation = f"all of the keywords {self.keywords} were found in the output"
+        return EvaluationResult(
+            score=float(contains_all),
+            explanation=explanation,
+        )
+
+
+class MatchesRegex(CodeEvaluator):
+    def __init__(self, pattern: Union[str, re.Pattern[str]], name: Optional[str] = None) -> None:
+        if isinstance(pattern, str):
+            pattern = re.compile(pattern)
+        self.pattern = pattern
+        assert isinstance(pattern, re.Pattern)
+        self._name = name or f"matches_({pattern})"
+
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        assert isinstance(output, str), "Experiment run output must be a string"
+        matches = self.pattern.findall(output)
+        if matches:
+            explanation = (
+                f"the substrings {matches} matched the regex pattern {self.pattern.pattern}"
+            )
+        else:
+            explanation = f"no substrings matched the regex pattern {self.pattern.pattern}"
+        return EvaluationResult(
+            score=float(bool(matches)),
+            explanation=explanation,
+        )
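The code evaluators above can also be exercised directly by calling `evaluate` with an `output` string; a usage sketch (all strings and scores below are made-up illustrations):

from phoenix.experiments.evaluators.code_evaluators import (
    ContainsAnyKeyword,
    JSONParsable,
    MatchesRegex,
)

contains = ContainsAnyKeyword(keywords=["refund", "return policy"])
result = contains.evaluate(output="Our return policy allows refunds within 30 days.")
print(result.score, result.explanation)  # 1.0, plus which keywords were found

json_check = JSONParsable()
print(json_check.evaluate(output='{"answer": 42}').score)  # 1

phone = MatchesRegex(pattern=r"\d{3}-\d{4}", name="contains_phone_number")
print(phone.evaluate(output="Call 555-0199 for help.").score)  # 1.0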
phoenix/experiments/evaluators/llm_evaluators.py (new file)
@@ -0,0 +1,244 @@
+import re
+from types import MappingProxyType
+from typing import Any, Callable, Optional, Type
+
+from phoenix.evals.models.base import BaseModel as LLMBaseModel
+from phoenix.evals.utils import snap_to_rail
+from phoenix.experiments.evaluators.base import (
+    ExperimentEvaluator,
+    LLMEvaluator,
+)
+from phoenix.experiments.evaluators.utils import unwrap_json
+from phoenix.experiments.types import (
+    EvaluationResult,
+    ExampleInput,
+    ExampleMetadata,
+    TaskOutput,
+)
+
+
+class LLMCriteriaEvaluator(LLMEvaluator):
+    _base_template = (
+        "Determine if the following text is {criteria}. {description}"
+        "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
+        "a single word label; 'true' if the text is {criteria} or 'false' if the text is not "
+        "{criteria}. Here is an example template for whether the text meets a criteria:\n\n"
+        "CRITERIA: the text is '{criteria}'\n"
+        "TEXT: *the provided text to evaluate*\n"
+        "EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
+        "the criteria*\n"
+        "LABEL: *true or false*\n\n"
+        "Follow this template for the following example:\n\n"
+        "CRITERIA: the text is '{criteria}'\n"
+        "TEXT: {text}\n"
+        "EXPLANATION: "
+    )
+    _description = "In this context, '{criteria}' means the text '{description}'. "
+
+    def __init__(
+        self,
+        model: LLMBaseModel,
+        criteria: str,
+        description: str,
+        name: str,
+    ):
+        self.model = model
+        self.criteria = criteria
+        self.description = description
+        self.template = self._format_base_template(self.criteria, self.description)
+        self._name = name
+
+    def evaluate(self, *, output: Optional[TaskOutput] = None, **_: Any) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output)
+        unparsed_response = self.model._generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    async def async_evaluate(
+        self, *, output: Optional[TaskOutput] = None, **_: Any
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output)
+        unparsed_response = await self.model._async_generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    def _format_eval_template(self, output: TaskOutput) -> str:
+        assert output is not None
+        result = unwrap_json(output)
+        return self.template.format(text=str(result))
+
+    def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+        raw_label, explanation = (
+            _parse_label_from_explanation(unparsed_response),
+            unparsed_response,
+        )
+        label = snap_to_rail(raw_label, ["true", "false"])
+        if label == "true":
+            score = 1.0
+        elif label == "false":
+            score = 0.0
+        else:
+            raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+        return EvaluationResult(
+            score=score,
+            explanation=explanation,
+            metadata={},
+        )
+
+    @classmethod
+    def _format_base_template(cls, criteria: str, description: Optional[str] = None) -> str:
+        formatted_description = cls._description.format(criteria=criteria, description=description)
+        formatted_template = cls._base_template.format(
+            criteria=criteria,
+            description=formatted_description,
+            text="{text}",  # leave the text field as a placeholder
+        )
+        return formatted_template
+
+
+def criteria_evaluator_factory(
+    class_name: str, criteria: str, description: str, default_name: str
+) -> Type[ExperimentEvaluator]:
+    def _init(self, model: LLMBaseModel, name: str = default_name) -> None:  # type: ignore
+        LLMCriteriaEvaluator.__init__(self, model, criteria, description, name=name)
+
+    return type(
+        class_name,
+        (LLMCriteriaEvaluator,),
+        {
+            "__init__": _init,
+            "__module__": __name__,
+            "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
+        },
+    )
+
+
+ConcisenessEvaluator = criteria_evaluator_factory(
+    class_name="ConcisenessEvaluator",
+    criteria="concise",
+    description="is just a few sentences and easy to follow",
+    default_name="Conciseness",
+)
+
+
+HelpfulnessEvaluator = criteria_evaluator_factory(
+    class_name="HelpfulnessEvaluator",
+    criteria="helpful",
+    description="provides useful information",
+    default_name="Helpfulness",
+)
+
+
+CoherenceEvaluator = criteria_evaluator_factory(
+    class_name="CoherenceEvaluator",
+    criteria="coherent",
+    description="is coherent, well-structured, and logically sound",
+    default_name="Coherence",
+)
+
+
+def _parse_label_from_explanation(raw_string: str) -> str:
+    label_delimiter = r"(\W*label\W*)"
+    parts = re.split(label_delimiter, raw_string, flags=re.IGNORECASE)
+    if len(parts) > 1:
+        # Find the last occurrence of the delimiter and take the part after it
+        last_index = len(parts) - 1
+        while last_index > 0:
+            if re.match(label_delimiter, parts[last_index - 1], flags=re.IGNORECASE):
+                return parts[last_index].strip()
+            last_index -= 1
+    return raw_string
+
+
+class RelevanceEvaluator(LLMEvaluator):
+    template = (
+        "Determine if the following response is relevant to the query. In this context, "
+        "'relevance' means that the response directly addresses the core question or topic of the "
+        "query. First, explain step-by-step why you think the text is or is not relevant. "
+        "Then provide a single word label; 'true' if the text is relevant or 'false' if the text "
+        "is not relevant. "
+        "Here is an example template for your reponse:\n\n"
+        "CRITERIA: the response is 'relevant' to the query\n"
+        "QUERY: *text that contains a query*\n"
+        "RESPONSE: *a response that may or may not be relevant to the query*\n"
+        "EXPLANATION: *a step by step explanation of your reasoning for whether or not the "
+        "response is relevant to the query*\n"
+        "LABEL: *true or false*\n\n"
+        "Follow this template for the following example:\n\n"
+        "CRITERIA: the response is 'relevant' to the query\n"
+        "QUERY: {reference}\n"
+        "RESPONSE: {submission}\n"
+        "EXPLANATION: "
+    )
+
+    def __init__(
+        self,
+        model: LLMBaseModel,
+        get_query: Optional[Callable[[ExampleInput, ExampleMetadata], str]] = None,
+        get_response: Optional[Callable[[Optional[TaskOutput], ExampleMetadata], str]] = None,
+        name: str = "RelevanceEvaluator",
+    ):
+        self.model = model
+        self._name = name
+        self.get_query = get_query or self._default_get_query
+        self.get_response = get_response or self._default_get_response
+
+    def _format_eval_template(
+        self,
+        output: Optional[TaskOutput] = None,
+        input: ExampleInput = MappingProxyType({}),
+        metadata: ExampleMetadata = MappingProxyType({}),
+    ) -> str:
+        assert output is not None
+        query = self.get_query(input, metadata)
+        response = self.get_response(output, metadata)
+        return self.template.format(query=query, response=response)
+
+    def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+        raw_label, explanation = (
+            _parse_label_from_explanation(unparsed_response),
+            unparsed_response,
+        )
+        label = snap_to_rail(raw_label, ["true", "false"])
+        if label == "true":
+            score = 1.0
+        elif label == "false":
+            score = 0.0
+        else:
+            raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+        return EvaluationResult(
+            score=score,
+            explanation=explanation,
+            metadata={},
+        )
+
+    def _default_get_query(self, input: ExampleInput, *args: Any, **kwargs: Any) -> str:
+        return str(input)
+
+    def _default_get_response(
+        self, output: Optional[TaskOutput] = None, *args: Any, **kwargs: Any
+    ) -> str:
+        assert output is not None
+        return str(unwrap_json(output))
+
+    def evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
+        unparsed_response = self.model._generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    async def async_evaluate(
+        self,
+        *,
+        output: Optional[TaskOutput] = None,
+        metadata: ExampleMetadata = MappingProxyType({}),
+        input: ExampleInput = MappingProxyType({}),
+        **_: Any,
+    ) -> EvaluationResult:
+        formatted_template = self._format_eval_template(output, input, metadata)
+        unparsed_response = await self.model._async_generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
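The LLM evaluators take any concrete model derived from phoenix.evals' `BaseModel`. A hedged usage sketch follows; it assumes `OpenAIModel` from the separate `phoenix.evals` package (not part of this diff), and the model name shown is a hypothetical choice.

from phoenix.evals import OpenAIModel  # assumed external dependency, not in this diff
from phoenix.experiments.evaluators.llm_evaluators import ConcisenessEvaluator

model = OpenAIModel(model="gpt-4o")  # hypothetical model choice
conciseness = ConcisenessEvaluator(model=model)
result = conciseness.evaluate(
    output="Phoenix is an open-source observability tool for LLM applications."
)
# score is 1.0 or 0.0 (parsed from the 'true'/'false' label); explanation holds
# the model's step-by-step reasoning.
print(result.score, result.explanation)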
phoenix/experiments/evaluators/utils.py (new file)
@@ -0,0 +1,186 @@
+import functools
+import inspect
+from itertools import chain, islice, repeat
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+
+from phoenix.experiments.types import (
+    AnnotatorKind,
+    EvaluationResult,
+    JSONSerializable,
+)
+from phoenix.experiments.utils import get_func_name
+
+if TYPE_CHECKING:
+    from phoenix.experiments.evaluators.base import Evaluator
+
+
+def unwrap_json(obj: JSONSerializable) -> JSONSerializable:
+    if isinstance(obj, dict):
+        if len(obj) == 1:
+            key = next(iter(obj.keys()))
+            output = obj[key]
+            assert isinstance(
+                output, (dict, list, str, int, float, bool, type(None))
+            ), "Output must be JSON serializable"
+            return output
+    return obj
+
+
+def validate_evaluator_signature(sig: inspect.Signature) -> None:
+    # Check that the wrapped function has a valid signature for use as an evaluator
+    # If it does not, raise an error to exit early before running evaluations
+    params = sig.parameters
+    valid_named_params = {"input", "output", "expected", "reference", "metadata"}
+    if len(params) == 0:
+        raise ValueError("Evaluation function must have at least one parameter.")
+    if len(params) > 1:
+        for not_found in set(params) - valid_named_params:
+            param = params[not_found]
+            if (
+                param.kind is inspect.Parameter.VAR_KEYWORD
+                or param.default is not inspect.Parameter.empty
+            ):
+                continue
+            raise ValueError(
+                (
+                    f"Invalid parameter names in evaluation function: {', '.join(not_found)}. "
+                    "Parameters names for multi-argument functions must be "
+                    f"any of: {', '.join(valid_named_params)}."
+                )
+            )
+
+
+def _bind_evaluator_signature(sig: inspect.Signature, **kwargs: Any) -> inspect.BoundArguments:
+    parameter_mapping = {
+        "input": kwargs.get("input"),
+        "output": kwargs.get("output"),
+        "expected": kwargs.get("expected"),
+        "reference": kwargs.get("reference"),  # `reference` is an alias for `expected`
+        "metadata": kwargs.get("metadata"),
+    }
+    params = sig.parameters
+    if len(params) == 1:
+        parameter_name = next(iter(params))
+        if parameter_name in parameter_mapping:
+            return sig.bind(parameter_mapping[parameter_name])
+        else:
+            return sig.bind(parameter_mapping["output"])
+    return sig.bind_partial(
+        **{name: parameter_mapping[name] for name in set(parameter_mapping).intersection(params)}
+    )
+
+
+def create_evaluator(
+    kind: Union[str, AnnotatorKind] = AnnotatorKind.CODE,
+    name: Optional[str] = None,
+    scorer: Optional[Callable[[Any], EvaluationResult]] = None,
+) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    if scorer is None:
+        scorer = _default_eval_scorer
+
+    if isinstance(kind, str):
+        kind = AnnotatorKind(kind.upper())
+
+    def wrapper(func: Callable[..., Any]) -> "Evaluator":
+        nonlocal name
+        if not name:
+            name = get_func_name(func)
+        assert name is not None
+
+        wrapped_signature = inspect.signature(func)
+        validate_evaluator_signature(wrapped_signature)
+
+        if inspect.iscoroutinefunction(func):
+            return _wrap_coroutine_evaluation_function(name, kind, wrapped_signature, scorer)(func)
+        else:
+            return _wrap_sync_evaluation_function(name, kind, wrapped_signature, scorer)(func)
+
+    return wrapper
+
+
+def _wrap_coroutine_evaluation_function(
+    name: str,
+    annotator_kind: AnnotatorKind,
+    sig: inspect.Signature,
+    convert_to_score: Callable[[Any], EvaluationResult],
+) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    from phoenix.experiments.evaluators.base import Evaluator
+
+    def wrapper(func: Callable[..., Any]) -> "Evaluator":
+        class AsyncEvaluator(Evaluator):
+            def __init__(self) -> None:
+                self._name = name
+                self._kind = annotator_kind
+
+            @functools.wraps(func)
+            async def __call__(self, *args: Any, **kwargs: Any) -> Any:
+                return await func(*args, **kwargs)
+
+            async def async_evaluate(self, **kwargs: Any) -> EvaluationResult:
+                bound_signature = _bind_evaluator_signature(sig, **kwargs)
+                result = await func(*bound_signature.args, **bound_signature.kwargs)
+                return convert_to_score(result)
+
+        return AsyncEvaluator()
+
+    return wrapper
+
+
+def _wrap_sync_evaluation_function(
+    name: str,
+    annotator_kind: AnnotatorKind,
+    sig: inspect.Signature,
+    convert_to_score: Callable[[Any], EvaluationResult],
+) -> Callable[[Callable[..., Any]], "Evaluator"]:
+    from phoenix.experiments.evaluators.base import Evaluator
+
+    def wrapper(func: Callable[..., Any]) -> "Evaluator":
+        class SyncEvaluator(Evaluator):
+            def __init__(self) -> None:
+                self._name = name
+                self._kind = annotator_kind
+
+            @functools.wraps(func)
+            def __call__(self, *args: Any, **kwargs: Any) -> Any:
+                return func(*args, **kwargs)
+
+            def evaluate(self, **kwargs: Any) -> EvaluationResult:
+                bound_signature = _bind_evaluator_signature(sig, **kwargs)
+                result = func(*bound_signature.args, **bound_signature.kwargs)
+                return convert_to_score(result)
+
+        return SyncEvaluator()
+
+    return wrapper
+
+
+def _default_eval_scorer(result: Any) -> EvaluationResult:
+    if isinstance(result, EvaluationResult):
+        return result
+    if isinstance(result, bool):
+        return EvaluationResult(score=float(result), label=str(result))
+    if hasattr(result, "__float__"):
+        return EvaluationResult(score=float(result))
+    if isinstance(result, str):
+        return EvaluationResult(label=result)
+    if isinstance(result, (tuple, list)) and 0 < len(result) <= 3:
+        # Possible interpretations are:
+        # - 3-tuple: (Score, Label, Explanation)
+        # - 2-tuple: (Score, Explanation) or (Label, Explanation)
+        # - 1-tuple: (Score, ) or (Label, )
+        # Note that (Score, Label) conflicts with (Score, Explanation) and we
+        # pick the latter because it's probably more prevalent. To get
+        # (Score, Label), use a 3-tuple instead, i.e. (Score, Label, None).
+        a, b, c = islice(chain(result, repeat(None)), 3)
+        score, label, explanation = None, a, b
+        if hasattr(a, "__float__"):
+            try:
+                score = float(a)
+            except ValueError:
+                pass
+            else:
+                label, explanation = (None, b) if len(result) < 3 else (b, c)
+        return EvaluationResult(score=score, label=label, explanation=explanation)
+    if result is None:
+        return EvaluationResult(score=0)
+    raise ValueError(f"Unsupported evaluation result type: {type(result)}")
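The `create_evaluator` decorator shown above turns a plain function into an `Evaluator`, binding only the named parameters it declares (`input`, `output`, `expected`, `reference`, `metadata`) and converting its return value via `_default_eval_scorer`. A short sketch, importing from the module path in this hunk; the function name `exact_match` is illustrative.

from phoenix.experiments.evaluators.utils import create_evaluator


@create_evaluator(name="exact_match")
def exact_match(output, expected):
    # A bool return value is converted by `_default_eval_scorer` into an
    # EvaluationResult with score 1.0/0.0 and label "True"/"False".
    return output == expected


result = exact_match.evaluate(output="42", expected="42")
print(result.score, result.label)  # 1.0 True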