arize-phoenix 3.19.4__py3-none-any.whl → 3.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic.
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/METADATA +1 -1
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/RECORD +23 -18
- phoenix/__init__.py +7 -3
- phoenix/core/model.py +8 -6
- phoenix/core/model_schema_adapter.py +6 -6
- phoenix/datasets/dataset.py +9 -521
- phoenix/datasets/fixtures.py +16 -552
- phoenix/datasets/schema.py +24 -145
- phoenix/inferences/__init__.py +0 -0
- phoenix/inferences/fixtures.py +560 -0
- phoenix/inferences/inferences.py +525 -0
- phoenix/inferences/schema.py +151 -0
- phoenix/server/app.py +5 -0
- phoenix/server/main.py +8 -8
- phoenix/session/evaluation.py +1 -2
- phoenix/session/session.py +16 -16
- phoenix/utilities/deprecation.py +30 -0
- phoenix/version.py +1 -1
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/WHEEL +0 -0
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → inferences}/errors.py +0 -0
- /phoenix/{datasets → inferences}/validation.py +0 -0
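The headline change in 3.20.0 is a rename: the phoenix.datasets package is superseded by a new phoenix.inferences package (errors.py and validation.py simply move over), the Dataset class resurfaces as Inferences, and the new phoenix/utilities/deprecation.py (+30) suggests the old names are kept around as deprecation shims. A minimal sketch of user code against the new API follows; the continued importability of the old phoenix.datasets path is an assumption inferred from the file list, not verified against the wheel:

import pandas as pd
import phoenix as px

# New in 3.20.0: px.Inferences replaces the old px.Dataset.
inferences = px.Inferences(
    dataframe=pd.DataFrame({"prediction_label": ["cat", "dog"]}),
    schema=px.Schema(prediction_label_column_name="prediction_label"),
    name="primary",
)

# Presumably still importable during the deprecation window (with a warning):
# from phoenix.datasets import Dataset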
phoenix/inferences/inferences.py
ADDED
@@ -0,0 +1,525 @@
import logging
import uuid
from copy import deepcopy
from dataclasses import fields, replace
from typing import Any, Dict, List, Optional, Set, Tuple, Union

import numpy as np
import pandas as pd
from pandas import DataFrame, Series, Timestamp, read_parquet
from pandas.api.types import (
    is_numeric_dtype,
)
from typing_extensions import TypeAlias

from phoenix.config import DATASET_DIR, GENERATED_DATASET_NAME_PREFIX
from phoenix.datetime_utils import normalize_timestamps

from . import errors as err
from .schema import (
    LLM_SCHEMA_FIELD_NAMES,
    MULTI_COLUMN_SCHEMA_FIELD_NAMES,
    SINGLE_COLUMN_SCHEMA_FIELD_NAMES,
    EmbeddingColumnNames,
    EmbeddingFeatures,
    Schema,
    SchemaFieldName,
    SchemaFieldValue,
)
from .validation import validate_dataset_inputs

logger = logging.getLogger(__name__)

# A schema like object. Not recommended to use this directly
SchemaLike: TypeAlias = Any


class Inferences:
    """
    A dataset to use for analysis using phoenix.
    Used to construct a phoenix session via px.launch_app

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The pandas dataframe containing the data to analyze
    schema : phoenix.Schema
        the schema of the dataset. Maps dataframe columns to the appropriate
        model inference dimensions (features, predictions, actuals).
    name : str, optional
        The name of the dataset. If not provided, a random name will be generated.
        Is helpful for identifying the dataset in the application.

    Returns
    -------
    dataset : Dataset
        The dataset object that can be used in a phoenix session

    Examples
    --------
    >>> primary_dataset = px.Inferences(
    >>>     dataframe=production_dataframe, schema=schema, name="primary"
    >>> )
    """

    _data_file_name: str = "data.parquet"
    _schema_file_name: str = "schema.json"
    _is_persisted: bool = False
    _is_empty: bool = False

    def __init__(
        self,
        dataframe: DataFrame,
        schema: Union[Schema, SchemaLike],
        name: Optional[str] = None,
    ):
        # allow for schema like objects
        if not isinstance(schema, Schema):
            schema = _get_schema_from_unknown_schema_param(schema)
        errors = validate_dataset_inputs(
            dataframe=dataframe,
            schema=schema,
        )
        if errors:
            raise err.DatasetError(errors)
        dataframe, schema = _parse_dataframe_and_schema(dataframe, schema)
        dataframe, schema = _normalize_timestamps(
            dataframe, schema, default_timestamp=Timestamp.utcnow()
        )
        dataframe = _sort_dataframe_rows_by_timestamp(dataframe, schema)
        self.__dataframe: DataFrame = dataframe
        self.__schema: Schema = schema
        self.__name: str = (
            name if name is not None else f"{GENERATED_DATASET_NAME_PREFIX}{str(uuid.uuid4())}"
        )
        self._is_empty = self.dataframe.empty
        logger.info(f"""Dataset: {self.__name} initialized""")

    def __repr__(self) -> str:
        return f'<Dataset "{self.name}">'

    @property
    def dataframe(self) -> DataFrame:
        return self.__dataframe

    @property
    def schema(self) -> "Schema":
        return self.__schema

    @property
    def name(self) -> str:
        return self.__name

    @classmethod
    def from_name(cls, name: str) -> "Inferences":
        """Retrieves a dataset by name from the file system"""
        directory = DATASET_DIR / name
        df = read_parquet(directory / cls._data_file_name)
        with open(directory / cls._schema_file_name) as schema_file:
            schema_json = schema_file.read()
        schema = Schema.from_json(schema_json)
        return cls(df, schema, name)

    def to_disc(self) -> None:
        """writes the data and schema to disc"""
        directory = DATASET_DIR / self.name
        directory.mkdir(parents=True, exist_ok=True)
        self.dataframe.to_parquet(
            directory / self._data_file_name,
            allow_truncated_timestamps=True,
            coerce_timestamps="ms",
        )
        schema_json_data = self.schema.to_json()
        with open(directory / self._schema_file_name, "w+") as schema_file:
            schema_file.write(schema_json_data)


def _parse_dataframe_and_schema(dataframe: DataFrame, schema: Schema) -> Tuple[DataFrame, Schema]:
    """
    Parses a dataframe according to a schema, infers feature columns names when
    they are not explicitly provided, and removes excluded column names from
    both dataframe and schema.

    Removes column names in `schema.excluded_column_names` from the input dataframe and schema. To
    remove an embedding feature and all associated columns, add the name of the embedding feature to
    `schema.excluded_column_names` rather than the associated column names. If
    `schema.feature_column_names` is `None`, automatically discovers features by adding all column
    names present in the dataframe but not included in any other schema fields.
    """

    unseen_excluded_column_names: Set[str] = (
        set(schema.excluded_column_names) if schema.excluded_column_names is not None else set()
    )
    unseen_column_names: Set[str] = set(dataframe.columns.to_list())
    column_name_to_include: Dict[str, bool] = {}
    schema_patch: Dict[SchemaFieldName, SchemaFieldValue] = {}

    for schema_field_name in SINGLE_COLUMN_SCHEMA_FIELD_NAMES:
        _check_single_column_schema_field_for_excluded_columns(
            schema,
            schema_field_name,
            unseen_excluded_column_names,
            schema_patch,
            column_name_to_include,
            unseen_column_names,
        )

    for schema_field_name in MULTI_COLUMN_SCHEMA_FIELD_NAMES:
        _check_multi_column_schema_field_for_excluded_columns(
            schema,
            schema_field_name,
            unseen_excluded_column_names,
            schema_patch,
            column_name_to_include,
            unseen_column_names,
        )

    if schema.embedding_feature_column_names:
        _check_embedding_features_schema_field_for_excluded_columns(
            schema.embedding_feature_column_names,
            unseen_excluded_column_names,
            schema_patch,
            column_name_to_include,
            unseen_column_names,
        )

    for llm_schema_field_name in LLM_SCHEMA_FIELD_NAMES:
        embedding_column_name_mapping = getattr(schema, llm_schema_field_name)
        if isinstance(embedding_column_name_mapping, EmbeddingColumnNames):
            _check_embedding_column_names_for_excluded_columns(
                embedding_column_name_mapping,
                column_name_to_include,
                unseen_column_names,
            )

    if not schema.feature_column_names and unseen_column_names:
        _discover_feature_columns(
            dataframe,
            unseen_excluded_column_names,
            schema_patch,
            column_name_to_include,
            unseen_column_names,
        )

    if unseen_excluded_column_names:
        logger.warning(
            "The following columns and embedding features were excluded in the schema but were "
            "not found in the dataframe: {}".format(", ".join(unseen_excluded_column_names))
        )

    parsed_dataframe, parsed_schema = _create_and_normalize_dataframe_and_schema(
        dataframe, schema, schema_patch, column_name_to_include
    )

    return parsed_dataframe, parsed_schema


def _check_single_column_schema_field_for_excluded_columns(
    schema: Schema,
    schema_field_name: str,
    unseen_excluded_column_names: Set[str],
    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
    column_name_to_include: Dict[str, bool],
    unseen_column_names: Set[str],
) -> None:
    """
    Checks single-column schema fields for excluded column names.
    """
    column_name: str = getattr(schema, schema_field_name)
    include_column: bool = column_name not in unseen_excluded_column_names
    column_name_to_include[column_name] = include_column
    if not include_column:
        schema_patch[schema_field_name] = None
        unseen_excluded_column_names.discard(column_name)
        logger.debug(f"excluded {schema_field_name}: {column_name}")
    unseen_column_names.discard(column_name)


def _check_multi_column_schema_field_for_excluded_columns(
    schema: Schema,
    schema_field_name: str,
    unseen_excluded_column_names: Set[str],
    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
    column_name_to_include: Dict[str, bool],
    unseen_column_names: Set[str],
) -> None:
    """
    Checks multi-column schema fields for excluded column names.
    """
    column_names: Optional[List[str]] = getattr(schema, schema_field_name)
    if column_names:
        included_column_names: List[str] = []
        excluded_column_names: List[str] = []
        for column_name in column_names:
            is_included_column = column_name not in unseen_excluded_column_names
            column_name_to_include[column_name] = is_included_column
            if is_included_column:
                included_column_names.append(column_name)
            else:
                excluded_column_names.append(column_name)
                unseen_excluded_column_names.discard(column_name)
                logger.debug(f"excluded {schema_field_name}: {column_name}")
            unseen_column_names.discard(column_name)
        schema_patch[schema_field_name] = included_column_names if included_column_names else None


def _check_embedding_features_schema_field_for_excluded_columns(
    embedding_features: EmbeddingFeatures,
    unseen_excluded_column_names: Set[str],
    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
    column_name_to_include: Dict[str, bool],
    unseen_column_names: Set[str],
) -> None:
    """
    Check embedding features for excluded column names.
    """
    included_embedding_features: EmbeddingFeatures = {}
    for (
        embedding_feature_name,
        embedding_column_name_mapping,
    ) in embedding_features.items():
        include_embedding_feature = embedding_feature_name not in unseen_excluded_column_names
        if include_embedding_feature:
            included_embedding_features[embedding_feature_name] = deepcopy(
                embedding_column_name_mapping
            )
        else:
            unseen_excluded_column_names.discard(embedding_feature_name)

        for embedding_field in fields(embedding_column_name_mapping):
            column_name: Optional[str] = getattr(
                embedding_column_name_mapping, embedding_field.name
            )
            if column_name is not None:
                column_name_to_include[column_name] = include_embedding_feature
                if (
                    column_name != embedding_feature_name
                    and column_name in unseen_excluded_column_names
                ):
                    logger.warning(
                        f"Excluding embedding feature columns such as "
                        f'"{column_name}" has no effect; instead exclude the '
                        f'corresponding embedding feature name "{embedding_feature_name}".'
                    )
                    unseen_excluded_column_names.discard(column_name)
                unseen_column_names.discard(column_name)
    schema_patch["embedding_feature_column_names"] = (
        included_embedding_features if included_embedding_features else None
    )


def _check_embedding_column_names_for_excluded_columns(
    embedding_column_name_mapping: EmbeddingColumnNames,
    column_name_to_include: Dict[str, bool],
    unseen_column_names: Set[str],
) -> None:
    """
    Check embedding column names for excluded column names.
    """
    for embedding_field in fields(embedding_column_name_mapping):
        column_name: Optional[str] = getattr(embedding_column_name_mapping, embedding_field.name)
        if column_name is not None:
            column_name_to_include[column_name] = True
            unseen_column_names.discard(column_name)


def _discover_feature_columns(
    dataframe: DataFrame,
    unseen_excluded_column_names: Set[str],
    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
    column_name_to_include: Dict[str, bool],
    unseen_column_names: Set[str],
) -> None:
    """
    Adds unseen and un-excluded columns as features, with the exception of "prediction_id"
    which is reserved
    """
    discovered_feature_column_names = []
    for column_name in unseen_column_names:
        if column_name not in unseen_excluded_column_names and column_name != "prediction_id":
            discovered_feature_column_names.append(column_name)
            column_name_to_include[column_name] = True
        else:
            unseen_excluded_column_names.discard(column_name)
            logger.debug(f"excluded feature: {column_name}")
    original_column_positions: List[int] = dataframe.columns.get_indexer(
        discovered_feature_column_names
    )  # type: ignore
    feature_column_name_to_position: Dict[str, int] = dict(
        zip(discovered_feature_column_names, original_column_positions)
    )
    discovered_feature_column_names.sort(key=lambda col: feature_column_name_to_position[col])
    schema_patch["feature_column_names"] = discovered_feature_column_names
    logger.debug(
        "Discovered feature column names: {}".format(", ".join(discovered_feature_column_names))
    )


def _create_and_normalize_dataframe_and_schema(
    dataframe: DataFrame,
    schema: Schema,
    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
    column_name_to_include: Dict[str, bool],
) -> Tuple[DataFrame, Schema]:
    """
    Creates new dataframe and schema objects to reflect excluded column names
    and discovered features. This also normalizes dataframe columns to ensure a
    standard set of columns (i.e. timestamp and prediction_id) and datatypes for
    those columns.
    """
    included_column_names: List[str] = []
    for column_name in dataframe.columns:
        if column_name_to_include.get(str(column_name), False):
            included_column_names.append(str(column_name))
    parsed_dataframe = dataframe[included_column_names].copy()
    parsed_schema = replace(schema, excluded_column_names=None, **schema_patch)  # type: ignore
    pred_id_col_name = parsed_schema.prediction_id_column_name
    if pred_id_col_name is None:
        parsed_schema = replace(parsed_schema, prediction_id_column_name="prediction_id")
        parsed_dataframe["prediction_id"] = _add_prediction_id(len(parsed_dataframe))
    elif is_numeric_dtype(parsed_dataframe.dtypes[pred_id_col_name]):
        parsed_dataframe[pred_id_col_name] = parsed_dataframe[pred_id_col_name].astype(str)
    for embedding in (
        [parsed_schema.prompt_column_names, parsed_schema.response_column_names]
        + list(parsed_schema.embedding_feature_column_names.values())
        if parsed_schema.embedding_feature_column_names is not None
        else []
    ):
        if not isinstance(embedding, EmbeddingColumnNames):
            continue
        vector_column_name = embedding.vector_column_name
        if vector_column_name not in parsed_dataframe.columns:
            continue
        parsed_dataframe.loc[:, vector_column_name] = _coerce_vectors_as_arrays_if_necessary(
            parsed_dataframe.loc[:, vector_column_name],
            vector_column_name,
        )
    return parsed_dataframe, parsed_schema


def _coerce_vectors_as_arrays_if_necessary(
    series: "pd.Series[Any]",
    column_name: str,
) -> "pd.Series[Any]":
    not_na = ~series.isna()
    if not_na.sum() == 0:
        return series
    if invalid_types := set(map(type, series.loc[not_na])) - {np.ndarray}:
        logger.warning(
            f"converting items in column `{column_name}` to numpy.ndarray, "
            f"because they have the following "
            f"type{'s' if len(invalid_types) > 1 else ''}: "
            f"{', '.join(map(lambda t: t.__name__, invalid_types))}"
        )
        return series.mask(not_na, series.loc[not_na].apply(np.array))
    return series


def _sort_dataframe_rows_by_timestamp(dataframe: DataFrame, schema: Schema) -> DataFrame:
    """
    Sorts dataframe rows by timestamp.
    """
    timestamp_column_name = schema.timestamp_column_name
    if timestamp_column_name is None:
        raise ValueError("Schema must specify a timestamp column name.")
    dataframe.set_index(timestamp_column_name, drop=False, inplace=True)
    dataframe.sort_index(inplace=True)
    return dataframe


def _normalize_timestamps(
    dataframe: DataFrame,
    schema: Schema,
    default_timestamp: Timestamp,
) -> Tuple[DataFrame, Schema]:
    """
    Ensures that the dataframe has a timestamp column and the schema has a timestamp field. If the
    input dataframe contains a Unix or datetime timestamp or ISO8601 timestamp strings column, it
    is converted to UTC timezone-aware timestamp. If the input dataframe and schema do not contain
    timestamps, the default timestamp is used. If a timestamp is timezone-naive, it is localized
    as per local timezone and then converted to UTC
    """
    timestamp_column: Series[Timestamp]
    if (timestamp_column_name := schema.timestamp_column_name) is None:
        timestamp_column_name = "timestamp"
        schema = replace(schema, timestamp_column_name=timestamp_column_name)
        timestamp_column = (
            Series([default_timestamp] * len(dataframe), index=dataframe.index)
            if len(dataframe)
            else Series([default_timestamp]).iloc[:0].set_axis(dataframe.index, axis=0)
        )
    else:
        timestamp_column = normalize_timestamps(
            dataframe[timestamp_column_name],
        )
    dataframe[timestamp_column_name] = timestamp_column
    return dataframe, schema


def _get_schema_from_unknown_schema_param(schemaLike: SchemaLike) -> Schema:
    """
    Compatibility function for converting from arize.utils.types.Schema to phoenix.inferences.Schema
    """
    try:
        from arize.utils.types import (
            EmbeddingColumnNames as ArizeEmbeddingColumnNames,  # fmt: off type: ignore
        )
        from arize.utils.types import Schema as ArizeSchema

        if not isinstance(schemaLike, ArizeSchema):
            raise ValueError("Unknown schema passed to Dataset. Please pass a phoenix Schema")

        embedding_feature_column_names: Dict[str, EmbeddingColumnNames] = {}
        if schemaLike.embedding_feature_column_names is not None:
            for (
                embedding_name,
                arize_embedding_feature_column_names,
            ) in schemaLike.embedding_feature_column_names.items():
                if isinstance(arize_embedding_feature_column_names, ArizeEmbeddingColumnNames):
                    embedding_feature_column_names[embedding_name] = EmbeddingColumnNames(
                        vector_column_name=arize_embedding_feature_column_names.vector_column_name,
                        link_to_data_column_name=arize_embedding_feature_column_names.link_to_data_column_name,
                        raw_data_column_name=arize_embedding_feature_column_names.data_column_name,
                    )
        prompt_column_names: Optional[EmbeddingColumnNames] = None
        if schemaLike.prompt_column_names is not None and isinstance(
            schemaLike.prompt_column_names, ArizeEmbeddingColumnNames
        ):
            prompt_column_names = EmbeddingColumnNames(
                vector_column_name=schemaLike.prompt_column_names.vector_column_name,
                raw_data_column_name=schemaLike.prompt_column_names.data_column_name,
                link_to_data_column_name=schemaLike.prompt_column_names.link_to_data_column_name,
            )
        response_column_names: Optional[EmbeddingColumnNames] = None
        if schemaLike.response_column_names is not None and isinstance(
            schemaLike.response_column_names, ArizeEmbeddingColumnNames
        ):
            response_column_names = EmbeddingColumnNames(
                vector_column_name=schemaLike.response_column_names.vector_column_name,
                raw_data_column_name=schemaLike.response_column_names.data_column_name,
                link_to_data_column_name=schemaLike.response_column_names.link_to_data_column_name,
            )
        return Schema(
            feature_column_names=schemaLike.feature_column_names,
            tag_column_names=schemaLike.tag_column_names,
            prediction_label_column_name=schemaLike.prediction_label_column_name,
            actual_label_column_name=schemaLike.actual_label_column_name,
            prediction_id_column_name=schemaLike.prediction_id_column_name,
            timestamp_column_name=schemaLike.timestamp_column_name,
            embedding_feature_column_names=embedding_feature_column_names,
            prompt_column_names=prompt_column_names,
            response_column_names=response_column_names,
        )
    except Exception:
        raise ValueError(
            """Unsupported Arize Schema. Please pass a phoenix Schema or update
            to the latest version of Arize."""
        )


def _add_prediction_id(num_rows: int) -> List[str]:
    return [str(uuid.uuid4()) for _ in range(num_rows)]


# A dataset with no data. Useful for stubs
EMPTY_DATASET = Inferences(pd.DataFrame(), schema=Schema())
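Taken together, from_name and to_disc give the new class a simple parquet-plus-JSON persistence round trip under DATASET_DIR. A minimal sketch, assuming px.Inferences and px.Schema are re-exported at the package root (as the class docstring's px.Inferences example implies); the column names and data are invented for illustration:

import pandas as pd
import phoenix as px

df = pd.DataFrame(
    {
        "prediction_id": ["a", "b"],
        "prediction_label": ["cat", "dog"],
    }
)
schema = px.Schema(
    prediction_id_column_name="prediction_id",
    prediction_label_column_name="prediction_label",
)
inferences = px.Inferences(dataframe=df, schema=schema, name="demo")
inferences.to_disc()  # writes data.parquet and schema.json under DATASET_DIR/demo
roundtrip = px.Inferences.from_name("demo")  # reads both files back

Note that a timestamp column is not required: _normalize_timestamps adds one filled with the current UTC time when the schema omits it.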
phoenix/inferences/schema.py
ADDED
@@ -0,0 +1,151 @@
import json
from dataclasses import asdict, dataclass, replace
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union

EmbeddingFeatures = Dict[str, "EmbeddingColumnNames"]
SchemaFieldName = str
SchemaFieldValue = Union[Optional[str], Optional[List[str]], Optional[EmbeddingFeatures]]

MULTI_COLUMN_SCHEMA_FIELD_NAMES: Tuple[str, ...] = ("feature_column_names", "tag_column_names")
SINGLE_COLUMN_SCHEMA_FIELD_NAMES: Tuple[str, ...] = (
    "prediction_id_column_name",
    "timestamp_column_name",
    "prediction_label_column_name",
    "prediction_score_column_name",
    "actual_label_column_name",
    "actual_score_column_name",
)
LLM_SCHEMA_FIELD_NAMES = ["prompt_column_names", "response_column_names"]


@dataclass(frozen=True)
class EmbeddingColumnNames(Dict[str, Any]):
    """
    A dataclass to hold the column names for the embedding features.
    An embedding feature is a feature that is represented by a vector.
    The vector is a representation of unstructured data, such as text or an image
    """

    vector_column_name: str
    raw_data_column_name: Optional[str] = None
    link_to_data_column_name: Optional[str] = None


@dataclass(frozen=True)
class RetrievalEmbeddingColumnNames(EmbeddingColumnNames):
    """
    A relationship is a column that maps a prediction to another record.

    Example
    -------
    For example, in context retrieval from a vector store, a query is
    embedded and used to search for relevant records in a vector store.
    In this case you would add a column to the dataset that maps the query
    to the vector store records. E.x. [document_1, document_5, document_3]

    A table view of the primary dataset could look like this:

    | query | retrieved_document_ids | document_relevance_scores |
    |-------|------------------------|---------------------------|
    | ...   | [doc_1, doc_5, doc_3]  | [0.4567, 0.3456, 0.2345]  |
    | ...   | [doc_1, doc_6, doc_2]  | [0.7890, 0.6789, 0.5678]  |
    | ...   | [doc_1, doc_6, doc_9]  | [0.9012, 0.8901, 0.0123]  |


    The corresponding vector store dataset would look like this:

    | id    | embedding_vector | document_text |
    |-------|------------------|---------------|
    | doc_1 | ...              | lorem ipsum   |
    | doc_2 | ...              | lorem ipsum   |
    | doc_3 | ...              | lorem ipsum   |


    To declare this relationship in the schema, you would configure the schema as follows:

    >>> schema = Schema(
    ...     prompt_column_names=RetrievalEmbeddingColumnNames(
    ...         context_retrieval_ids_column_name="retrieved_document_ids",
    ...         context_retrieval_scores_column_name="document_relevance_scores",
    ...     )
    ...)
    """

    context_retrieval_ids_column_name: Optional[str] = None
    context_retrieval_scores_column_name: Optional[str] = None


@dataclass(frozen=True)
class Schema:
    prediction_id_column_name: Optional[str] = None
    id_column_name: Optional[str] = None  # Syntax sugar for prediction_id_column_name
    timestamp_column_name: Optional[str] = None
    feature_column_names: Optional[List[str]] = None
    tag_column_names: Optional[List[str]] = None
    prediction_label_column_name: Optional[str] = None
    prediction_score_column_name: Optional[str] = None
    actual_label_column_name: Optional[str] = None
    actual_score_column_name: Optional[str] = None
    prompt_column_names: Optional[Union[EmbeddingColumnNames, RetrievalEmbeddingColumnNames]] = None
    response_column_names: Optional[Union[str, EmbeddingColumnNames]] = None
    # document_column_names is used explicitly when the schema is used to capture a corpus
    document_column_names: Optional[EmbeddingColumnNames] = None
    embedding_feature_column_names: Optional[EmbeddingFeatures] = None
    excluded_column_names: Optional[List[str]] = None

    def __post_init__(self) -> None:
        # re-map document_column_names to be in the prompt_column_names position
        # This is a shortcut to leverage the same schema for model and corpus datasets
        if self.document_column_names is not None:
            object.__setattr__(self, "prompt_column_names", self.document_column_names)
            object.__setattr__(self, "document_column_names", None)

        if self.id_column_name is not None:
            object.__setattr__(self, "prediction_id_column_name", self.id_column_name)
            object.__setattr__(self, "id_column_name", None)

    def replace(self, **changes: Any) -> "Schema":
        return replace(self, **changes)

    def asdict(self) -> Dict[str, str]:
        return asdict(self)

    def to_json(self) -> str:
        "Converts the schema to a dict for JSON serialization"
        return json.dumps(asdict(self))

    @classmethod
    def from_json(cls, json_string: str) -> "Schema":
        json_data = json.loads(json_string)

        # parse embedding_feature_column_names
        if json_data.get("embedding_feature_column_names") is not None:
            embedding_feature_column_names = {}
            for feature_name, column_names in json_data["embedding_feature_column_names"].items():
                embedding_feature_column_names[feature_name] = EmbeddingColumnNames(
                    vector_column_name=column_names["vector_column_name"],
                    raw_data_column_name=column_names["raw_data_column_name"],
                    link_to_data_column_name=column_names["link_to_data_column_name"],
                )
            json_data["embedding_feature_column_names"] = embedding_feature_column_names

        # parse prompt_column_names
        if (prompt := json_data.get("prompt_column_names")) is not None:
            json_data["prompt_column_names"] = RetrievalEmbeddingColumnNames(
                vector_column_name=prompt.get("vector_column_name"),
                raw_data_column_name=prompt.get("raw_data_column_name"),
                context_retrieval_ids_column_name=prompt.get("context_retrieval_ids_column_name"),
                context_retrieval_scores_column_name=prompt.get(
                    "context_retrieval_scores_column_name"
                ),
            )

        # parse response_column_names
        if isinstance(json_data.get("response_column_names"), Mapping):
            response_column_names = EmbeddingColumnNames(
                vector_column_name=json_data["response_column_names"]["vector_column_name"],
                raw_data_column_name=json_data["response_column_names"]["raw_data_column_name"],
            )
            json_data["response_column_names"] = response_column_names

        return cls(**json_data)
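The custom from_json exists because json.dumps(asdict(schema)) flattens the nested dataclasses into plain dicts; the classmethod rebuilds EmbeddingColumnNames and RetrievalEmbeddingColumnNames from those mappings. A small round-trip sketch (column names invented); since these are frozen dataclasses that compare by field, the round trip should preserve equality:

from phoenix.inferences.schema import EmbeddingColumnNames, Schema

schema = Schema(
    prediction_id_column_name="prediction_id",
    embedding_feature_column_names={
        "text_embedding": EmbeddingColumnNames(
            vector_column_name="embedding_vector",
            raw_data_column_name="text",
        )
    },
)
restored = Schema.from_json(schema.to_json())
assert restored == schema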
phoenix/server/app.py
CHANGED
@@ -142,6 +142,10 @@ async def version(_: Request) -> PlainTextResponse:
    return PlainTextResponse(f"{phoenix.__version__}")


+async def check_healthz(_: Request) -> PlainTextResponse:
+    return PlainTextResponse("OK")
+
+
def create_app(
    export_path: Path,
    model: Model,
@@ -193,6 +197,7 @@ def create_app(
        )
        + [
            Route("/arize_phoenix_version", version),
+            Route("/healthz", check_healthz),
            Route(
                "/exports",
                type(