arize-phoenix 4.4.2__py3-none-any.whl → 4.4.4rc0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/METADATA +12 -11
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/RECORD +110 -57
- phoenix/__init__.py +0 -27
- phoenix/config.py +21 -7
- phoenix/core/model.py +25 -25
- phoenix/core/model_schema.py +66 -64
- phoenix/core/model_schema_adapter.py +27 -25
- phoenix/datasets/__init__.py +0 -0
- phoenix/datasets/evaluators.py +275 -0
- phoenix/datasets/experiments.py +469 -0
- phoenix/datasets/tracing.py +66 -0
- phoenix/datasets/types.py +212 -0
- phoenix/db/bulk_inserter.py +54 -14
- phoenix/db/insertion/dataset.py +234 -0
- phoenix/db/insertion/evaluation.py +6 -6
- phoenix/db/insertion/helpers.py +13 -2
- phoenix/db/migrations/types.py +29 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +291 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +2 -28
- phoenix/db/models.py +230 -3
- phoenix/inferences/fixtures.py +23 -23
- phoenix/inferences/inferences.py +7 -7
- phoenix/inferences/validation.py +1 -1
- phoenix/metrics/binning.py +2 -2
- phoenix/server/api/context.py +16 -0
- phoenix/server/api/dataloaders/__init__.py +16 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +100 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +43 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +85 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +43 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +49 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +2 -3
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/trace_row_ids.py +39 -0
- phoenix/server/api/helpers/dataset_helpers.py +178 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +9 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/mutations/__init__.py +13 -0
- phoenix/server/api/mutations/auth.py +11 -0
- phoenix/server/api/mutations/dataset_mutations.py +520 -0
- phoenix/server/api/mutations/experiment_mutations.py +65 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +17 -14
- phoenix/server/api/mutations/project_mutations.py +42 -0
- phoenix/server/api/queries.py +503 -0
- phoenix/server/api/routers/v1/__init__.py +77 -2
- phoenix/server/api/routers/v1/dataset_examples.py +178 -0
- phoenix/server/api/routers/v1/datasets.py +861 -0
- phoenix/server/api/routers/v1/evaluations.py +4 -2
- phoenix/server/api/routers/v1/experiment_evaluations.py +65 -0
- phoenix/server/api/routers/v1/experiment_runs.py +108 -0
- phoenix/server/api/routers/v1/experiments.py +174 -0
- phoenix/server/api/routers/v1/spans.py +3 -1
- phoenix/server/api/routers/v1/traces.py +1 -4
- phoenix/server/api/schema.py +2 -303
- phoenix/server/api/types/AnnotatorKind.py +10 -0
- phoenix/server/api/types/Cluster.py +19 -19
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/Dataset.py +282 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +30 -29
- phoenix/server/api/types/EmbeddingDimension.py +40 -34
- phoenix/server/api/types/Event.py +16 -16
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +135 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +19 -0
- phoenix/server/api/types/ExperimentRun.py +91 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +57 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/Model.py +43 -42
- phoenix/server/api/types/Project.py +26 -12
- phoenix/server/api/types/Segments.py +1 -1
- phoenix/server/api/types/Span.py +78 -2
- phoenix/server/api/types/TimeSeries.py +6 -6
- phoenix/server/api/types/Trace.py +15 -4
- phoenix/server/api/types/UMAPPoints.py +1 -1
- phoenix/server/api/types/node.py +5 -111
- phoenix/server/api/types/pagination.py +10 -52
- phoenix/server/app.py +99 -49
- phoenix/server/main.py +49 -27
- phoenix/server/openapi/docs.py +3 -0
- phoenix/server/static/index.js +2246 -1368
- phoenix/server/templates/index.html +1 -0
- phoenix/services.py +15 -15
- phoenix/session/client.py +316 -21
- phoenix/session/session.py +47 -37
- phoenix/trace/exporter.py +14 -9
- phoenix/trace/fixtures.py +133 -7
- phoenix/trace/span_evaluations.py +3 -3
- phoenix/trace/trace_dataset.py +6 -6
- phoenix/utilities/json.py +61 -0
- phoenix/utilities/re.py +50 -0
- phoenix/version.py +1 -1
- phoenix/server/api/types/DatasetRole.py +0 -23
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/server/api/{helpers.py → helpers/__init__.py} +0 -0
phoenix/datasets/evaluators.py (new file)

```diff
@@ -0,0 +1,275 @@
+import json
+import re
+from typing import TYPE_CHECKING, Callable, Optional, Type
+
+from phoenix.datasets.types import (
+    EvaluationResult,
+    Example,
+    ExperimentEvaluator,
+    ExperimentRun,
+    JSONSerializable,
+)
+from phoenix.evals.models.base import BaseModel as LLMBaseModel
+from phoenix.evals.utils import snap_to_rail
+
+
+def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
+    if isinstance(obj, dict):
+        if len(obj) == 1:
+            key = next(iter(obj.keys()))
+            output = obj[key]
+            assert isinstance(
+                output, (dict, list, str, int, float, bool, type(None))
+            ), "Output must be JSON serializable"
+            return output
+    return obj
+
+
+class JSONParsable:
+    annotator_kind = "CODE"
+    name = "JSONParsable"
+
+    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+        assert exp_run.output is not None
+        output = _unwrap_json(exp_run.output.result)
+        assert isinstance(output, str), "Experiment run output must be a string"
+        try:
+            json.loads(output)
+            json_parsable = True
+        except BaseException:
+            json_parsable = False
+        return EvaluationResult(
+            score=int(json_parsable),
+        )
+
+
+class ContainsKeyword:
+    annotator_kind = "CODE"
+
+    def __init__(self, keyword: str) -> None:
+        super().__init__()
+        self.keyword = keyword
+        self.name = f"ContainsKeyword({keyword})"
+
+    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+        assert exp_run.output is not None
+        result = _unwrap_json(exp_run.output.result)
+        assert isinstance(result, str), "Experiment run output must be a string"
+        found = self.keyword in result
+        return EvaluationResult(
+            score=float(found),
+            explanation=(
+                f"the string {repr(self.keyword)} was "
+                f"{'found' if found else 'not found'} in the output"
+            ),
+        )
+
+
+class LLMCriteriaEvaluator:
+    annotator_kind = "LLM"
+    _base_template = (
+        "Determine if the following text is {criteria}. {description}"
+        "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
+        "a single word label; 'true' if the text is {criteria} or 'false' if the text is not "
+        "{criteria}. Here is an example template for whether the text meets a criteria:\n\n"
+        "CRITERIA: the text is '{criteria}'\n"
+        "TEXT: *the provided text to evaluate*\n"
+        "EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
+        "the criteria*\n"
+        "LABEL: *true or false*\n\n"
+        "Follow this template for the following text:\n\n"
+        "CRITERIA: the text is '{criteria}'\n"
+        "TEXT: {text}\n"
+        "EXPLANATION: "
+    )
+    _description = "In this context, '{criteria}' means the text '{description}'. "
+
+    def __init__(
+        self,
+        model: LLMBaseModel,
+        criteria: str,
+        description: str,
+        name: str,
+    ):
+        self.model = model
+        self.criteria = criteria
+        self.description = description
+        self.template = self._format_base_template(self.criteria, self.description)
+        self.name = name
+
+    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+        formatted_template = self._format_eval_template(exp_run)
+        unparsed_response = self.model._generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+        formatted_template = self._format_eval_template(exp_run)
+        unparsed_response = await self.model._async_generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    def _format_eval_template(self, experiment_run: ExperimentRun) -> str:
+        assert experiment_run.output is not None
+        result = _unwrap_json(experiment_run.output.result)
+        return self.template.format(text=str(result))
+
+    def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+        raw_label, explanation = (
+            _parse_label_from_explanation(unparsed_response),
+            unparsed_response,
+        )
+        label = snap_to_rail(raw_label, ["true", "false"])
+        if label == "true":
+            score = 1.0
+        elif label == "false":
+            score = 0.0
+        else:
+            raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+        return EvaluationResult(
+            score=score,
+            explanation=explanation,
+            metadata={},
+        )
+
+    @classmethod
+    def _format_base_template(cls, criteria: str, description: Optional[str] = None) -> str:
+        formatted_description = cls._description.format(criteria=criteria, description=description)
+        formatted_template = cls._base_template.format(
+            criteria=criteria,
+            description=formatted_description,
+            text="{text}",  # leave the text field as a placeholder
+        )
+        return formatted_template
+
+
+def criteria_evaluator_factory(
+    class_name: str, criteria: str, description: str
+) -> Type[ExperimentEvaluator]:
+    return type(
+        class_name,
+        (LLMCriteriaEvaluator,),
+        {
+            "__init__": lambda self, model: LLMCriteriaEvaluator.__init__(
+                self, model, criteria, description, name=class_name
+            ),
+            "__module__": __name__,
+            "name": class_name,
+            "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
+        },
+    )
+
+
+LLMConcisenessEvaluator = criteria_evaluator_factory(
+    class_name="LLMConcisenessEvaluator",
+    criteria="concise",
+    description="is just a few sentences and easy to follow",
+)
+
+
+LLMHelpfulnessEvaluator = criteria_evaluator_factory(
+    class_name="LLMHelpfulnessEvaluator",
+    criteria="helpful",
+    description="provides useful information",
+)
+
+
+LLMCoherenceEvaluator = criteria_evaluator_factory(
+    class_name="LLMCoherenceEvaluator",
+    criteria="coherent",
+    description="is coherent, well-structured, and organized",
+)
+
+
+def _parse_label_from_explanation(raw_string: str) -> str:
+    label_delimiter = r"(\W*label\W*)"
+    parts = re.split(label_delimiter, raw_string, flags=re.IGNORECASE)
+    if len(parts) > 1:
+        # Find the last occurrence of the delimiter and take the part after it
+        last_index = len(parts) - 1
+        while last_index > 0:
+            if re.match(label_delimiter, parts[last_index - 1], flags=re.IGNORECASE):
+                return parts[last_index].strip()
+            last_index -= 1
+    return raw_string
+
+
+class RelevanceEvaluator:
+    annotator_kind = "LLM"
+    template = (
+        "Determine if the following response is relevant to the query. In this context, "
+        "'relevance' means that the response directly addresses the core question or topic of the "
+        "query. First, explain step-by-step why you think the text is or is not relevant. "
+        "Then provide a single word label; 'true' if the text is relevant or 'false' if the text "
+        "is not relevant. "
+        "Here is an example template for your reponse:\n\n"
+        "CRITERIA: the response is 'relevant' to the query\n"
+        "QUERY: *text that contains a query*\n"
+        "RESPONSE: *a response that may or may not be relevant to the query*\n"
+        "EXPLANATION: *a step by step explanation of your reasoning for whether or not the "
+        "response is relevant to the query*\n"
+        "LABEL: *true or false*\n\n"
+        "Follow this template for the following example:\n\n"
+        "CRITERIA: the response is 'relevant' to the query\n"
+        "QUERY: {reference}\n"
+        "RESPONSE: {submission}\n"
+        "EXPLANATION: "
+    )
+
+    def __init__(
+        self,
+        model: LLMBaseModel,
+        get_query: Optional[Callable[[Example, ExperimentRun], str]] = None,
+        get_response: Optional[Callable[[Example, ExperimentRun], str]] = None,
+        name: str = "RelevanceEvaluator",
+    ):
+        self.model = model
+        self.name = name
+        self.get_query = get_query or self._default_get_query
+        self.get_response = get_response or self._default_get_response
+
+    def _format_eval_template(self, example: Example, experiment_run: ExperimentRun) -> str:
+        assert experiment_run.output is not None
+        query = self.get_query(example, experiment_run)
+        response = self.get_response(example, experiment_run)
+        return self.template.format(query=query, response=response)
+
+    def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+        raw_label, explanation = (
+            _parse_label_from_explanation(unparsed_response),
+            unparsed_response,
+        )
+        label = snap_to_rail(raw_label, ["true", "false"])
+        if label == "true":
+            score = 1.0
+        elif label == "false":
+            score = 0.0
+        else:
+            raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+        return EvaluationResult(
+            score=score,
+            explanation=explanation,
+            metadata={},
+        )
+
+    def _default_get_query(self, example: Example, experiment_run: ExperimentRun) -> str:
+        return str(example.input)
+
+    def _default_get_response(self, example: Example, experiment_run: ExperimentRun) -> str:
+        assert experiment_run.output is not None
+        return str(_unwrap_json(experiment_run.output.result))
+
+    def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+        formatted_template = self._format_eval_template(example, exp_run)
+        unparsed_response = self.model._generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+    async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+        formatted_template = self._format_eval_template(example, exp_run)
+        unparsed_response = await self.model._async_generate(formatted_template)
+        return self._parse_eval_output(unparsed_response)
+
+
+# Someday we'll do typing checking in unit tests.
+if TYPE_CHECKING:
+    _: ExperimentEvaluator
+    _ = JSONParsable()
+    _ = ContainsKeyword("test")
```