arize-phoenix 4.4.2__py3-none-any.whl → 4.4.4rc0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (111)
  1. {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/METADATA +12 -11
  2. {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/RECORD +110 -57
  3. phoenix/__init__.py +0 -27
  4. phoenix/config.py +21 -7
  5. phoenix/core/model.py +25 -25
  6. phoenix/core/model_schema.py +66 -64
  7. phoenix/core/model_schema_adapter.py +27 -25
  8. phoenix/datasets/__init__.py +0 -0
  9. phoenix/datasets/evaluators.py +275 -0
  10. phoenix/datasets/experiments.py +469 -0
  11. phoenix/datasets/tracing.py +66 -0
  12. phoenix/datasets/types.py +212 -0
  13. phoenix/db/bulk_inserter.py +54 -14
  14. phoenix/db/insertion/dataset.py +234 -0
  15. phoenix/db/insertion/evaluation.py +6 -6
  16. phoenix/db/insertion/helpers.py +13 -2
  17. phoenix/db/migrations/types.py +29 -0
  18. phoenix/db/migrations/versions/10460e46d750_datasets.py +291 -0
  19. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +2 -28
  20. phoenix/db/models.py +230 -3
  21. phoenix/inferences/fixtures.py +23 -23
  22. phoenix/inferences/inferences.py +7 -7
  23. phoenix/inferences/validation.py +1 -1
  24. phoenix/metrics/binning.py +2 -2
  25. phoenix/server/api/context.py +16 -0
  26. phoenix/server/api/dataloaders/__init__.py +16 -0
  27. phoenix/server/api/dataloaders/dataset_example_revisions.py +100 -0
  28. phoenix/server/api/dataloaders/dataset_example_spans.py +43 -0
  29. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +85 -0
  30. phoenix/server/api/dataloaders/experiment_error_rates.py +43 -0
  31. phoenix/server/api/dataloaders/experiment_sequence_number.py +49 -0
  32. phoenix/server/api/dataloaders/project_by_name.py +31 -0
  33. phoenix/server/api/dataloaders/span_descendants.py +2 -3
  34. phoenix/server/api/dataloaders/span_projects.py +33 -0
  35. phoenix/server/api/dataloaders/trace_row_ids.py +39 -0
  36. phoenix/server/api/helpers/dataset_helpers.py +178 -0
  37. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
  38. phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
  39. phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
  40. phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
  41. phoenix/server/api/input_types/DatasetSort.py +17 -0
  42. phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
  43. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
  44. phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
  45. phoenix/server/api/input_types/DeleteExperimentsInput.py +9 -0
  46. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
  47. phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
  48. phoenix/server/api/mutations/__init__.py +13 -0
  49. phoenix/server/api/mutations/auth.py +11 -0
  50. phoenix/server/api/mutations/dataset_mutations.py +520 -0
  51. phoenix/server/api/mutations/experiment_mutations.py +65 -0
  52. phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +17 -14
  53. phoenix/server/api/mutations/project_mutations.py +42 -0
  54. phoenix/server/api/queries.py +503 -0
  55. phoenix/server/api/routers/v1/__init__.py +77 -2
  56. phoenix/server/api/routers/v1/dataset_examples.py +178 -0
  57. phoenix/server/api/routers/v1/datasets.py +861 -0
  58. phoenix/server/api/routers/v1/evaluations.py +4 -2
  59. phoenix/server/api/routers/v1/experiment_evaluations.py +65 -0
  60. phoenix/server/api/routers/v1/experiment_runs.py +108 -0
  61. phoenix/server/api/routers/v1/experiments.py +174 -0
  62. phoenix/server/api/routers/v1/spans.py +3 -1
  63. phoenix/server/api/routers/v1/traces.py +1 -4
  64. phoenix/server/api/schema.py +2 -303
  65. phoenix/server/api/types/AnnotatorKind.py +10 -0
  66. phoenix/server/api/types/Cluster.py +19 -19
  67. phoenix/server/api/types/CreateDatasetPayload.py +8 -0
  68. phoenix/server/api/types/Dataset.py +282 -63
  69. phoenix/server/api/types/DatasetExample.py +85 -0
  70. phoenix/server/api/types/DatasetExampleRevision.py +34 -0
  71. phoenix/server/api/types/DatasetVersion.py +14 -0
  72. phoenix/server/api/types/Dimension.py +30 -29
  73. phoenix/server/api/types/EmbeddingDimension.py +40 -34
  74. phoenix/server/api/types/Event.py +16 -16
  75. phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
  76. phoenix/server/api/types/Experiment.py +135 -0
  77. phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
  78. phoenix/server/api/types/ExperimentComparison.py +19 -0
  79. phoenix/server/api/types/ExperimentRun.py +91 -0
  80. phoenix/server/api/types/ExperimentRunAnnotation.py +57 -0
  81. phoenix/server/api/types/Inferences.py +80 -0
  82. phoenix/server/api/types/InferencesRole.py +23 -0
  83. phoenix/server/api/types/Model.py +43 -42
  84. phoenix/server/api/types/Project.py +26 -12
  85. phoenix/server/api/types/Segments.py +1 -1
  86. phoenix/server/api/types/Span.py +78 -2
  87. phoenix/server/api/types/TimeSeries.py +6 -6
  88. phoenix/server/api/types/Trace.py +15 -4
  89. phoenix/server/api/types/UMAPPoints.py +1 -1
  90. phoenix/server/api/types/node.py +5 -111
  91. phoenix/server/api/types/pagination.py +10 -52
  92. phoenix/server/app.py +99 -49
  93. phoenix/server/main.py +49 -27
  94. phoenix/server/openapi/docs.py +3 -0
  95. phoenix/server/static/index.js +2246 -1368
  96. phoenix/server/templates/index.html +1 -0
  97. phoenix/services.py +15 -15
  98. phoenix/session/client.py +316 -21
  99. phoenix/session/session.py +47 -37
  100. phoenix/trace/exporter.py +14 -9
  101. phoenix/trace/fixtures.py +133 -7
  102. phoenix/trace/span_evaluations.py +3 -3
  103. phoenix/trace/trace_dataset.py +6 -6
  104. phoenix/utilities/json.py +61 -0
  105. phoenix/utilities/re.py +50 -0
  106. phoenix/version.py +1 -1
  107. phoenix/server/api/types/DatasetRole.py +0 -23
  108. {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/WHEEL +0 -0
  109. {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/licenses/IP_NOTICE +0 -0
  110. {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/licenses/LICENSE +0 -0
  111. /phoenix/server/api/{helpers.py → helpers/__init__.py} +0 -0
phoenix/datasets/evaluators.py (new file)
@@ -0,0 +1,275 @@
+ import json
+ import re
+ from typing import TYPE_CHECKING, Callable, Optional, Type
+
+ from phoenix.datasets.types import (
+     EvaluationResult,
+     Example,
+     ExperimentEvaluator,
+     ExperimentRun,
+     JSONSerializable,
+ )
+ from phoenix.evals.models.base import BaseModel as LLMBaseModel
+ from phoenix.evals.utils import snap_to_rail
+
+
+ def _unwrap_json(obj: JSONSerializable) -> JSONSerializable:
+     if isinstance(obj, dict):
+         if len(obj) == 1:
+             key = next(iter(obj.keys()))
+             output = obj[key]
+             assert isinstance(
+                 output, (dict, list, str, int, float, bool, type(None))
+             ), "Output must be JSON serializable"
+             return output
+     return obj
+
+
+ class JSONParsable:
+     annotator_kind = "CODE"
+     name = "JSONParsable"
+
+     def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+         assert exp_run.output is not None
+         output = _unwrap_json(exp_run.output.result)
+         assert isinstance(output, str), "Experiment run output must be a string"
+         try:
+             json.loads(output)
+             json_parsable = True
+         except BaseException:
+             json_parsable = False
+         return EvaluationResult(
+             score=int(json_parsable),
+         )
+
+
+ class ContainsKeyword:
+     annotator_kind = "CODE"
+
+     def __init__(self, keyword: str) -> None:
+         super().__init__()
+         self.keyword = keyword
+         self.name = f"ContainsKeyword({keyword})"
+
+     def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+         assert exp_run.output is not None
+         result = _unwrap_json(exp_run.output.result)
+         assert isinstance(result, str), "Experiment run output must be a string"
+         found = self.keyword in result
+         return EvaluationResult(
+             score=float(found),
+             explanation=(
+                 f"the string {repr(self.keyword)} was "
+                 f"{'found' if found else 'not found'} in the output"
+             ),
+         )
+
+
+ class LLMCriteriaEvaluator:
+     annotator_kind = "LLM"
+     _base_template = (
+         "Determine if the following text is {criteria}. {description}"
+         "First, explain step-by-step why you think the text is or is not {criteria}. Then provide "
+         "a single word label; 'true' if the text is {criteria} or 'false' if the text is not "
+         "{criteria}. Here is an example template for whether the text meets a criteria:\n\n"
+         "CRITERIA: the text is '{criteria}'\n"
+         "TEXT: *the provided text to evaluate*\n"
+         "EXPLANATION: *a step by step explanation of your reasoning for whether the text meets "
+         "the criteria*\n"
+         "LABEL: *true or false*\n\n"
+         "Follow this template for the following text:\n\n"
+         "CRITERIA: the text is '{criteria}'\n"
+         "TEXT: {text}\n"
+         "EXPLANATION: "
+     )
+     _description = "In this context, '{criteria}' means the text '{description}'. "
+
+     def __init__(
+         self,
+         model: LLMBaseModel,
+         criteria: str,
+         description: str,
+         name: str,
+     ):
+         self.model = model
+         self.criteria = criteria
+         self.description = description
+         self.template = self._format_base_template(self.criteria, self.description)
+         self.name = name
+
+     def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+         formatted_template = self._format_eval_template(exp_run)
+         unparsed_response = self.model._generate(formatted_template)
+         return self._parse_eval_output(unparsed_response)
+
+     async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+         formatted_template = self._format_eval_template(exp_run)
+         unparsed_response = await self.model._async_generate(formatted_template)
+         return self._parse_eval_output(unparsed_response)
+
+     def _format_eval_template(self, experiment_run: ExperimentRun) -> str:
+         assert experiment_run.output is not None
+         result = _unwrap_json(experiment_run.output.result)
+         return self.template.format(text=str(result))
+
+     def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+         raw_label, explanation = (
+             _parse_label_from_explanation(unparsed_response),
+             unparsed_response,
+         )
+         label = snap_to_rail(raw_label, ["true", "false"])
+         if label == "true":
+             score = 1.0
+         elif label == "false":
+             score = 0.0
+         else:
+             raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+         return EvaluationResult(
+             score=score,
+             explanation=explanation,
+             metadata={},
+         )
+
+     @classmethod
+     def _format_base_template(cls, criteria: str, description: Optional[str] = None) -> str:
+         formatted_description = cls._description.format(criteria=criteria, description=description)
+         formatted_template = cls._base_template.format(
+             criteria=criteria,
+             description=formatted_description,
+             text="{text}",  # leave the text field as a placeholder
+         )
+         return formatted_template
+
+
+ def criteria_evaluator_factory(
+     class_name: str, criteria: str, description: str
+ ) -> Type[ExperimentEvaluator]:
+     return type(
+         class_name,
+         (LLMCriteriaEvaluator,),
+         {
+             "__init__": lambda self, model: LLMCriteriaEvaluator.__init__(
+                 self, model, criteria, description, name=class_name
+             ),
+             "__module__": __name__,
+             "name": class_name,
+             "template": LLMCriteriaEvaluator._format_base_template(criteria, description),
+         },
+     )
+
+
+ LLMConcisenessEvaluator = criteria_evaluator_factory(
+     class_name="LLMConcisenessEvaluator",
+     criteria="concise",
+     description="is just a few sentences and easy to follow",
+ )
+
+
+ LLMHelpfulnessEvaluator = criteria_evaluator_factory(
+     class_name="LLMHelpfulnessEvaluator",
+     criteria="helpful",
+     description="provides useful information",
+ )
+
+
+ LLMCoherenceEvaluator = criteria_evaluator_factory(
+     class_name="LLMCoherenceEvaluator",
+     criteria="coherent",
+     description="is coherent, well-structured, and organized",
+ )
+
+
+ def _parse_label_from_explanation(raw_string: str) -> str:
+     label_delimiter = r"(\W*label\W*)"
+     parts = re.split(label_delimiter, raw_string, flags=re.IGNORECASE)
+     if len(parts) > 1:
+         # Find the last occurrence of the delimiter and take the part after it
+         last_index = len(parts) - 1
+         while last_index > 0:
+             if re.match(label_delimiter, parts[last_index - 1], flags=re.IGNORECASE):
+                 return parts[last_index].strip()
+             last_index -= 1
+     return raw_string
+
+
+ class RelevanceEvaluator:
+     annotator_kind = "LLM"
+     template = (
+         "Determine if the following response is relevant to the query. In this context, "
+         "'relevance' means that the response directly addresses the core question or topic of the "
+         "query. First, explain step-by-step why you think the text is or is not relevant. "
+         "Then provide a single word label; 'true' if the text is relevant or 'false' if the text "
+         "is not relevant. "
+         "Here is an example template for your response:\n\n"
+         "CRITERIA: the response is 'relevant' to the query\n"
+         "QUERY: *text that contains a query*\n"
+         "RESPONSE: *a response that may or may not be relevant to the query*\n"
+         "EXPLANATION: *a step by step explanation of your reasoning for whether or not the "
+         "response is relevant to the query*\n"
+         "LABEL: *true or false*\n\n"
+         "Follow this template for the following example:\n\n"
+         "CRITERIA: the response is 'relevant' to the query\n"
+         "QUERY: {reference}\n"
+         "RESPONSE: {submission}\n"
+         "EXPLANATION: "
+     )
+
+     def __init__(
+         self,
+         model: LLMBaseModel,
+         get_query: Optional[Callable[[Example, ExperimentRun], str]] = None,
+         get_response: Optional[Callable[[Example, ExperimentRun], str]] = None,
+         name: str = "RelevanceEvaluator",
+     ):
+         self.model = model
+         self.name = name
+         self.get_query = get_query or self._default_get_query
+         self.get_response = get_response or self._default_get_response
+
+     def _format_eval_template(self, example: Example, experiment_run: ExperimentRun) -> str:
+         assert experiment_run.output is not None
+         query = self.get_query(example, experiment_run)
+         response = self.get_response(example, experiment_run)
+         return self.template.format(reference=query, submission=response)  # keyword args match the {reference}/{submission} placeholders above
+
+     def _parse_eval_output(self, unparsed_response: str) -> EvaluationResult:
+         raw_label, explanation = (
+             _parse_label_from_explanation(unparsed_response),
+             unparsed_response,
+         )
+         label = snap_to_rail(raw_label, ["true", "false"])
+         if label == "true":
+             score = 1.0
+         elif label == "false":
+             score = 0.0
+         else:
+             raise RuntimeError(f"Could not parse LLM evaluation: {unparsed_response}")
+         return EvaluationResult(
+             score=score,
+             explanation=explanation,
+             metadata={},
+         )
+
+     def _default_get_query(self, example: Example, experiment_run: ExperimentRun) -> str:
+         return str(example.input)
+
+     def _default_get_response(self, example: Example, experiment_run: ExperimentRun) -> str:
+         assert experiment_run.output is not None
+         return str(_unwrap_json(experiment_run.output.result))
+
+     def evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+         formatted_template = self._format_eval_template(example, exp_run)
+         unparsed_response = self.model._generate(formatted_template)
+         return self._parse_eval_output(unparsed_response)
+
+     async def async_evaluate(self, example: Example, exp_run: ExperimentRun) -> EvaluationResult:
+         formatted_template = self._format_eval_template(example, exp_run)
+         unparsed_response = await self.model._async_generate(formatted_template)
+         return self._parse_eval_output(unparsed_response)
+
+
+ # Someday we'll do type checking in unit tests.
+ if TYPE_CHECKING:
+     _: ExperimentEvaluator
+     _ = JSONParsable()
+     _ = ContainsKeyword("test")
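
As context for the new evaluators module, here is a minimal usage sketch of the code-based evaluators, assuming arize-phoenix 4.4.4rc0 is installed. Example and ExperimentRun are stubbed with SimpleNamespace stand-ins because their real constructors live in phoenix/datasets/types.py, outside this hunk; only the attributes the evaluators actually read are provided.

from types import SimpleNamespace

from phoenix.datasets.evaluators import ContainsKeyword, JSONParsable

# Hypothetical stand-ins for Example and ExperimentRun; the evaluators above
# only read exp_run.output.result (and example.input for RelevanceEvaluator).
example = SimpleNamespace(input={"question": "What is Phoenix?"})
exp_run = SimpleNamespace(
    output=SimpleNamespace(result='{"answer": "An observability platform."}')
)

# JSONParsable scores 1 because the run output parses as JSON;
# ContainsKeyword scores 1.0 because "observability" appears in the output.
print(JSONParsable().evaluate(example, exp_run))
print(ContainsKeyword("observability").evaluate(example, exp_run))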
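The criteria_evaluator_factory shown above can also build criteria beyond the three shipped evaluators. A sketch under that assumption; LLMPolitenessEvaluator is a hypothetical name, not part of the package.

from phoenix.datasets.evaluators import criteria_evaluator_factory

# A hypothetical criterion built with the same factory that produces the
# LLMConciseness/Helpfulness/Coherence evaluators. Instances take a
# phoenix.evals model (LLMBaseModel) as their single constructor argument.
LLMPolitenessEvaluator = criteria_evaluator_factory(
    class_name="LLMPolitenessEvaluator",
    criteria="polite",
    description="is courteous and respectful in tone",
)

# The factory pre-renders the prompt, leaving only {text} as a placeholder.
print(LLMPolitenessEvaluator.template)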
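Finally, a small sketch of the label-parsing path shared by the LLM evaluators: _parse_label_from_explanation (a private helper in this module) pulls the text after the last LABEL delimiter out of the raw completion, and snap_to_rail from phoenix.evals.utils maps it onto the allowed rails. The completion string below is made up for illustration.

from phoenix.datasets.evaluators import _parse_label_from_explanation
from phoenix.evals.utils import snap_to_rail

# A made-up completion in the format the prompt templates above ask for.
raw_response = (
    "EXPLANATION: the response directly answers the question.\n"
    "LABEL: true"
)

raw_label = _parse_label_from_explanation(raw_response)  # -> "true"
label = snap_to_rail(raw_label, ["true", "false"])       # -> "true"
score = 1.0 if label == "true" else 0.0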