arize-phoenix 3.19.4__py3-none-any.whl → 3.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

@@ -0,0 +1,525 @@
+import logging
+import uuid
+from copy import deepcopy
+from dataclasses import fields, replace
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from pandas import DataFrame, Series, Timestamp, read_parquet
+from pandas.api.types import (
+    is_numeric_dtype,
+)
+from typing_extensions import TypeAlias
+
+from phoenix.config import DATASET_DIR, GENERATED_DATASET_NAME_PREFIX
+from phoenix.datetime_utils import normalize_timestamps
+
+from . import errors as err
+from .schema import (
+    LLM_SCHEMA_FIELD_NAMES,
+    MULTI_COLUMN_SCHEMA_FIELD_NAMES,
+    SINGLE_COLUMN_SCHEMA_FIELD_NAMES,
+    EmbeddingColumnNames,
+    EmbeddingFeatures,
+    Schema,
+    SchemaFieldName,
+    SchemaFieldValue,
+)
+from .validation import validate_dataset_inputs
+
+logger = logging.getLogger(__name__)
+
+# A schema like object. Not recommended to use this directly
+SchemaLike: TypeAlias = Any
+
+
+class Inferences:
+    """
+    A dataset to use for analysis using phoenix.
+    Used to construct a phoenix session via px.launch_app
+
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+        The pandas dataframe containing the data to analyze
+    schema : phoenix.Schema
+        the schema of the dataset. Maps dataframe columns to the appropriate
+        model inference dimensions (features, predictions, actuals).
+    name : str, optional
+        The name of the dataset. If not provided, a random name will be generated.
+        Is helpful for identifying the dataset in the application.
+
+    Returns
+    -------
+    dataset : Dataset
+        The dataset object that can be used in a phoenix session
+
+    Examples
+    --------
+    >>> primary_dataset = px.Inferences(
+    >>>     dataframe=production_dataframe, schema=schema, name="primary"
+    >>> )
+    """
+
+    _data_file_name: str = "data.parquet"
+    _schema_file_name: str = "schema.json"
+    _is_persisted: bool = False
+    _is_empty: bool = False
+
+    def __init__(
+        self,
+        dataframe: DataFrame,
+        schema: Union[Schema, SchemaLike],
+        name: Optional[str] = None,
+    ):
+        # allow for schema like objects
+        if not isinstance(schema, Schema):
+            schema = _get_schema_from_unknown_schema_param(schema)
+        errors = validate_dataset_inputs(
+            dataframe=dataframe,
+            schema=schema,
+        )
+        if errors:
+            raise err.DatasetError(errors)
+        dataframe, schema = _parse_dataframe_and_schema(dataframe, schema)
+        dataframe, schema = _normalize_timestamps(
+            dataframe, schema, default_timestamp=Timestamp.utcnow()
+        )
+        dataframe = _sort_dataframe_rows_by_timestamp(dataframe, schema)
+        self.__dataframe: DataFrame = dataframe
+        self.__schema: Schema = schema
+        self.__name: str = (
+            name if name is not None else f"{GENERATED_DATASET_NAME_PREFIX}{str(uuid.uuid4())}"
+        )
+        self._is_empty = self.dataframe.empty
+        logger.info(f"""Dataset: {self.__name} initialized""")
+
+    def __repr__(self) -> str:
+        return f'<Dataset "{self.name}">'
+
+    @property
+    def dataframe(self) -> DataFrame:
+        return self.__dataframe
+
+    @property
+    def schema(self) -> "Schema":
+        return self.__schema
+
+    @property
+    def name(self) -> str:
+        return self.__name
+
+    @classmethod
+    def from_name(cls, name: str) -> "Inferences":
+        """Retrieves a dataset by name from the file system"""
+        directory = DATASET_DIR / name
+        df = read_parquet(directory / cls._data_file_name)
+        with open(directory / cls._schema_file_name) as schema_file:
+            schema_json = schema_file.read()
+        schema = Schema.from_json(schema_json)
+        return cls(df, schema, name)
+
+    def to_disc(self) -> None:
+        """writes the data and schema to disc"""
+        directory = DATASET_DIR / self.name
+        directory.mkdir(parents=True, exist_ok=True)
+        self.dataframe.to_parquet(
+            directory / self._data_file_name,
+            allow_truncated_timestamps=True,
+            coerce_timestamps="ms",
+        )
+        schema_json_data = self.schema.to_json()
+        with open(directory / self._schema_file_name, "w+") as schema_file:
+            schema_file.write(schema_json_data)
+
+
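Before the private helpers below, a minimal usage sketch of the class above. It assumes phoenix re-exports Inferences and Schema at the top level (as the docstring's px.Inferences example suggests); the dataframe columns and the dataset name are made up for illustration.

# Illustrative only: construct an Inferences object from a small dataframe.
# Columns not referenced by the schema (here "tail_length") are auto-discovered
# as features by _parse_dataframe_and_schema below.
import pandas as pd
import phoenix as px

df = pd.DataFrame(
    {
        "prediction_id": ["a", "b", "c"],
        "timestamp": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03"]),
        "prediction_label": ["cat", "dog", "cat"],
        "actual_label": ["cat", "cat", "dog"],
        "tail_length": [10.2, 31.1, 9.8],
    }
)
schema = px.Schema(
    prediction_id_column_name="prediction_id",
    timestamp_column_name="timestamp",
    prediction_label_column_name="prediction_label",
    actual_label_column_name="actual_label",
)
inferences = px.Inferences(dataframe=df, schema=schema, name="production")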
+def _parse_dataframe_and_schema(dataframe: DataFrame, schema: Schema) -> Tuple[DataFrame, Schema]:
+    """
+    Parses a dataframe according to a schema, infers feature columns names when
+    they are not explicitly provided, and removes excluded column names from
+    both dataframe and schema.
+
+    Removes column names in `schema.excluded_column_names` from the input dataframe and schema. To
+    remove an embedding feature and all associated columns, add the name of the embedding feature to
+    `schema.excluded_column_names` rather than the associated column names. If
+    `schema.feature_column_names` is `None`, automatically discovers features by adding all column
+    names present in the dataframe but not included in any other schema fields.
+    """
+
+    unseen_excluded_column_names: Set[str] = (
+        set(schema.excluded_column_names) if schema.excluded_column_names is not None else set()
+    )
+    unseen_column_names: Set[str] = set(dataframe.columns.to_list())
+    column_name_to_include: Dict[str, bool] = {}
+    schema_patch: Dict[SchemaFieldName, SchemaFieldValue] = {}
+
+    for schema_field_name in SINGLE_COLUMN_SCHEMA_FIELD_NAMES:
+        _check_single_column_schema_field_for_excluded_columns(
+            schema,
+            schema_field_name,
+            unseen_excluded_column_names,
+            schema_patch,
+            column_name_to_include,
+            unseen_column_names,
+        )
+
+    for schema_field_name in MULTI_COLUMN_SCHEMA_FIELD_NAMES:
+        _check_multi_column_schema_field_for_excluded_columns(
+            schema,
+            schema_field_name,
+            unseen_excluded_column_names,
+            schema_patch,
+            column_name_to_include,
+            unseen_column_names,
+        )
+
+    if schema.embedding_feature_column_names:
+        _check_embedding_features_schema_field_for_excluded_columns(
+            schema.embedding_feature_column_names,
+            unseen_excluded_column_names,
+            schema_patch,
+            column_name_to_include,
+            unseen_column_names,
+        )
+
+    for llm_schema_field_name in LLM_SCHEMA_FIELD_NAMES:
+        embedding_column_name_mapping = getattr(schema, llm_schema_field_name)
+        if isinstance(embedding_column_name_mapping, EmbeddingColumnNames):
+            _check_embedding_column_names_for_excluded_columns(
+                embedding_column_name_mapping,
+                column_name_to_include,
+                unseen_column_names,
+            )
+
+    if not schema.feature_column_names and unseen_column_names:
+        _discover_feature_columns(
+            dataframe,
+            unseen_excluded_column_names,
+            schema_patch,
+            column_name_to_include,
+            unseen_column_names,
+        )
+
+    if unseen_excluded_column_names:
+        logger.warning(
+            "The following columns and embedding features were excluded in the schema but were "
+            "not found in the dataframe: {}".format(", ".join(unseen_excluded_column_names))
+        )
+
+    parsed_dataframe, parsed_schema = _create_and_normalize_dataframe_and_schema(
+        dataframe, schema, schema_patch, column_name_to_include
+    )
+
+    return parsed_dataframe, parsed_schema
+
+
+def _check_single_column_schema_field_for_excluded_columns(
+    schema: Schema,
+    schema_field_name: str,
+    unseen_excluded_column_names: Set[str],
+    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
+    column_name_to_include: Dict[str, bool],
+    unseen_column_names: Set[str],
+) -> None:
+    """
+    Checks single-column schema fields for excluded column names.
+    """
+    column_name: str = getattr(schema, schema_field_name)
+    include_column: bool = column_name not in unseen_excluded_column_names
+    column_name_to_include[column_name] = include_column
+    if not include_column:
+        schema_patch[schema_field_name] = None
+        unseen_excluded_column_names.discard(column_name)
+        logger.debug(f"excluded {schema_field_name}: {column_name}")
+    unseen_column_names.discard(column_name)
+
+
+def _check_multi_column_schema_field_for_excluded_columns(
+    schema: Schema,
+    schema_field_name: str,
+    unseen_excluded_column_names: Set[str],
+    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
+    column_name_to_include: Dict[str, bool],
+    unseen_column_names: Set[str],
+) -> None:
+    """
+    Checks multi-column schema fields for excluded columns names.
+    """
+    column_names: Optional[List[str]] = getattr(schema, schema_field_name)
+    if column_names:
+        included_column_names: List[str] = []
+        excluded_column_names: List[str] = []
+        for column_name in column_names:
+            is_included_column = column_name not in unseen_excluded_column_names
+            column_name_to_include[column_name] = is_included_column
+            if is_included_column:
+                included_column_names.append(column_name)
+            else:
+                excluded_column_names.append(column_name)
+                unseen_excluded_column_names.discard(column_name)
+                logger.debug(f"excluded {schema_field_name}: {column_name}")
+            unseen_column_names.discard(column_name)
+        schema_patch[schema_field_name] = included_column_names if included_column_names else None
+
+
+def _check_embedding_features_schema_field_for_excluded_columns(
+    embedding_features: EmbeddingFeatures,
+    unseen_excluded_column_names: Set[str],
+    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
+    column_name_to_include: Dict[str, bool],
+    unseen_column_names: Set[str],
+) -> None:
+    """
+    Check embedding features for excluded column names.
+    """
+    included_embedding_features: EmbeddingFeatures = {}
+    for (
+        embedding_feature_name,
+        embedding_column_name_mapping,
+    ) in embedding_features.items():
+        include_embedding_feature = embedding_feature_name not in unseen_excluded_column_names
+        if include_embedding_feature:
+            included_embedding_features[embedding_feature_name] = deepcopy(
+                embedding_column_name_mapping
+            )
+        else:
+            unseen_excluded_column_names.discard(embedding_feature_name)
+
+        for embedding_field in fields(embedding_column_name_mapping):
+            column_name: Optional[str] = getattr(
+                embedding_column_name_mapping, embedding_field.name
+            )
+            if column_name is not None:
+                column_name_to_include[column_name] = include_embedding_feature
+                if (
+                    column_name != embedding_feature_name
+                    and column_name in unseen_excluded_column_names
+                ):
+                    logger.warning(
+                        f"Excluding embedding feature columns such as "
+                        f'"{column_name}" has no effect; instead exclude the '
+                        f'corresponding embedding feature name "{embedding_feature_name}".'
+                    )
+                    unseen_excluded_column_names.discard(column_name)
+                unseen_column_names.discard(column_name)
+    schema_patch["embedding_feature_column_names"] = (
+        included_embedding_features if included_embedding_features else None
+    )
+
+
+def _check_embedding_column_names_for_excluded_columns(
+    embedding_column_name_mapping: EmbeddingColumnNames,
+    column_name_to_include: Dict[str, bool],
+    unseen_column_names: Set[str],
+) -> None:
+    """
+    Check embedding column names for excluded column names.
+    """
+    for embedding_field in fields(embedding_column_name_mapping):
+        column_name: Optional[str] = getattr(embedding_column_name_mapping, embedding_field.name)
+        if column_name is not None:
+            column_name_to_include[column_name] = True
+            unseen_column_names.discard(column_name)
+
+
+def _discover_feature_columns(
+    dataframe: DataFrame,
+    unseen_excluded_column_names: Set[str],
+    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
+    column_name_to_include: Dict[str, bool],
+    unseen_column_names: Set[str],
+) -> None:
+    """
+    Adds unseen and un-excluded columns as features, with the exception of "prediction_id"
+    which is reserved
+    """
+    discovered_feature_column_names = []
+    for column_name in unseen_column_names:
+        if column_name not in unseen_excluded_column_names and column_name != "prediction_id":
+            discovered_feature_column_names.append(column_name)
+            column_name_to_include[column_name] = True
+        else:
+            unseen_excluded_column_names.discard(column_name)
+            logger.debug(f"excluded feature: {column_name}")
+    original_column_positions: List[int] = dataframe.columns.get_indexer(
+        discovered_feature_column_names
+    )  # type: ignore
+    feature_column_name_to_position: Dict[str, int] = dict(
+        zip(discovered_feature_column_names, original_column_positions)
+    )
+    discovered_feature_column_names.sort(key=lambda col: feature_column_name_to_position[col])
+    schema_patch["feature_column_names"] = discovered_feature_column_names
+    logger.debug(
+        "Discovered feature column names: {}".format(", ".join(discovered_feature_column_names))
+    )
+
+
+def _create_and_normalize_dataframe_and_schema(
+    dataframe: DataFrame,
+    schema: Schema,
+    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
+    column_name_to_include: Dict[str, bool],
+) -> Tuple[DataFrame, Schema]:
+    """
+    Creates new dataframe and schema objects to reflect excluded column names
+    and discovered features. This also normalizes dataframe columns to ensure a
+    standard set of columns (i.e. timestamp and prediction_id) and datatypes for
+    those columns.
+    """
+    included_column_names: List[str] = []
+    for column_name in dataframe.columns:
+        if column_name_to_include.get(str(column_name), False):
+            included_column_names.append(str(column_name))
+    parsed_dataframe = dataframe[included_column_names].copy()
+    parsed_schema = replace(schema, excluded_column_names=None, **schema_patch)  # type: ignore
+    pred_id_col_name = parsed_schema.prediction_id_column_name
+    if pred_id_col_name is None:
+        parsed_schema = replace(parsed_schema, prediction_id_column_name="prediction_id")
+        parsed_dataframe["prediction_id"] = _add_prediction_id(len(parsed_dataframe))
+    elif is_numeric_dtype(parsed_dataframe.dtypes[pred_id_col_name]):
+        parsed_dataframe[pred_id_col_name] = parsed_dataframe[pred_id_col_name].astype(str)
+    for embedding in (
+        [parsed_schema.prompt_column_names, parsed_schema.response_column_names]
+        + list(parsed_schema.embedding_feature_column_names.values())
+        if parsed_schema.embedding_feature_column_names is not None
+        else []
+    ):
+        if not isinstance(embedding, EmbeddingColumnNames):
+            continue
+        vector_column_name = embedding.vector_column_name
+        if vector_column_name not in parsed_dataframe.columns:
+            continue
+        parsed_dataframe.loc[:, vector_column_name] = _coerce_vectors_as_arrays_if_necessary(
+            parsed_dataframe.loc[:, vector_column_name],
+            vector_column_name,
+        )
+    return parsed_dataframe, parsed_schema
+
+
+def _coerce_vectors_as_arrays_if_necessary(
+    series: "pd.Series[Any]",
+    column_name: str,
+) -> "pd.Series[Any]":
+    not_na = ~series.isna()
+    if not_na.sum() == 0:
+        return series
+    if invalid_types := set(map(type, series.loc[not_na])) - {np.ndarray}:
+        logger.warning(
+            f"converting items in column `{column_name}` to numpy.ndarray, "
+            f"because they have the following "
+            f"type{'s' if len(invalid_types) > 1 else ''}: "
+            f"{', '.join(map(lambda t: t.__name__, invalid_types))}"
+        )
+        return series.mask(not_na, series.loc[not_na].apply(np.array))
+    return series
+
+
+def _sort_dataframe_rows_by_timestamp(dataframe: DataFrame, schema: Schema) -> DataFrame:
+    """
+    Sorts dataframe rows by timestamp.
+    """
+    timestamp_column_name = schema.timestamp_column_name
+    if timestamp_column_name is None:
+        raise ValueError("Schema must specify a timestamp column name.")
+    dataframe.set_index(timestamp_column_name, drop=False, inplace=True)
+    dataframe.sort_index(inplace=True)
+    return dataframe
+
+
+def _normalize_timestamps(
+    dataframe: DataFrame,
+    schema: Schema,
+    default_timestamp: Timestamp,
+) -> Tuple[DataFrame, Schema]:
+    """
+    Ensures that the dataframe has a timestamp column and the schema has a timestamp field. If the
+    input dataframe contains a Unix or datetime timestamp or ISO8601 timestamp strings column, it
+    is converted to UTC timezone-aware timestamp. If the input dataframe and schema do not contain
+    timestamps, the default timestamp is used. If a timestamp is timezone-naive, it is localized
+    as per local timezone and then converted to UTC
+    """
+    timestamp_column: Series[Timestamp]
+    if (timestamp_column_name := schema.timestamp_column_name) is None:
+        timestamp_column_name = "timestamp"
+        schema = replace(schema, timestamp_column_name=timestamp_column_name)
+        timestamp_column = (
+            Series([default_timestamp] * len(dataframe), index=dataframe.index)
+            if len(dataframe)
+            else Series([default_timestamp]).iloc[:0].set_axis(dataframe.index, axis=0)
+        )
+    else:
+        timestamp_column = normalize_timestamps(
+            dataframe[timestamp_column_name],
+        )
+    dataframe[timestamp_column_name] = timestamp_column
+    return dataframe, schema
+
+
+def _get_schema_from_unknown_schema_param(schemaLike: SchemaLike) -> Schema:
+    """
+    Compatibility function for converting from arize.utils.types.Schema to phoenix.inferences.Schema
+    """
+    try:
+        from arize.utils.types import (
+            EmbeddingColumnNames as ArizeEmbeddingColumnNames,  # fmt: off type: ignore
+        )
+        from arize.utils.types import Schema as ArizeSchema
+
+        if not isinstance(schemaLike, ArizeSchema):
+            raise ValueError("Unknown schema passed to Dataset. Please pass a phoenix Schema")
+
+        embedding_feature_column_names: Dict[str, EmbeddingColumnNames] = {}
+        if schemaLike.embedding_feature_column_names is not None:
+            for (
+                embedding_name,
+                arize_embedding_feature_column_names,
+            ) in schemaLike.embedding_feature_column_names.items():
+                if isinstance(arize_embedding_feature_column_names, ArizeEmbeddingColumnNames):
+                    embedding_feature_column_names[embedding_name] = EmbeddingColumnNames(
+                        vector_column_name=arize_embedding_feature_column_names.vector_column_name,
+                        link_to_data_column_name=arize_embedding_feature_column_names.link_to_data_column_name,
+                        raw_data_column_name=arize_embedding_feature_column_names.data_column_name,
+                    )
+        prompt_column_names: Optional[EmbeddingColumnNames] = None
+        if schemaLike.prompt_column_names is not None and isinstance(
+            schemaLike.prompt_column_names, ArizeEmbeddingColumnNames
+        ):
+            prompt_column_names = EmbeddingColumnNames(
+                vector_column_name=schemaLike.prompt_column_names.vector_column_name,
+                raw_data_column_name=schemaLike.prompt_column_names.data_column_name,
+                link_to_data_column_name=schemaLike.prompt_column_names.link_to_data_column_name,
+            )
+        response_column_names: Optional[EmbeddingColumnNames] = None
+        if schemaLike.response_column_names is not None and isinstance(
+            schemaLike.response_column_names, ArizeEmbeddingColumnNames
+        ):
+            response_column_names = EmbeddingColumnNames(
+                vector_column_name=schemaLike.response_column_names.vector_column_name,
+                raw_data_column_name=schemaLike.response_column_names.data_column_name,
+                link_to_data_column_name=schemaLike.response_column_names.link_to_data_column_name,
+            )
+        return Schema(
+            feature_column_names=schemaLike.feature_column_names,
+            tag_column_names=schemaLike.tag_column_names,
+            prediction_label_column_name=schemaLike.prediction_label_column_name,
+            actual_label_column_name=schemaLike.actual_label_column_name,
+            prediction_id_column_name=schemaLike.prediction_id_column_name,
+            timestamp_column_name=schemaLike.timestamp_column_name,
+            embedding_feature_column_names=embedding_feature_column_names,
+            prompt_column_names=prompt_column_names,
+            response_column_names=response_column_names,
+        )
+    except Exception:
+        raise ValueError(
+            """Unsupported Arize Schema. Please pass a phoenix Schema or update
+            to the latest version of Arize."""
+        )
+
+
+def _add_prediction_id(num_rows: int) -> List[str]:
+    return [str(uuid.uuid4()) for _ in range(num_rows)]
+
+
+# A dataset with no data. Useful for stubs
+EMPTY_DATASET = Inferences(pd.DataFrame(), schema=Schema())
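A short sketch of the exclusion rule spelled out in the _parse_dataframe_and_schema docstring: to drop an embedding feature, exclude the embedding feature's name rather than its underlying columns. The top-level import path and all column names here are assumptions for illustration.

# Illustrative only: excluding the embedding feature by name removes both
# "product_vector" and "product_text" from the parsed dataframe and schema;
# excluding "product_vector" directly would only trigger the warning above.
from phoenix import EmbeddingColumnNames, Schema

schema = Schema(
    timestamp_column_name="timestamp",
    embedding_feature_column_names={
        "product_embedding": EmbeddingColumnNames(
            vector_column_name="product_vector",
            raw_data_column_name="product_text",
        ),
    },
    excluded_column_names=["product_embedding"],
)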
@@ -0,0 +1,151 @@
+import json
+from dataclasses import asdict, dataclass, replace
+from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
+
+EmbeddingFeatures = Dict[str, "EmbeddingColumnNames"]
+SchemaFieldName = str
+SchemaFieldValue = Union[Optional[str], Optional[List[str]], Optional[EmbeddingFeatures]]
+
+MULTI_COLUMN_SCHEMA_FIELD_NAMES: Tuple[str, ...] = ("feature_column_names", "tag_column_names")
+SINGLE_COLUMN_SCHEMA_FIELD_NAMES: Tuple[str, ...] = (
+    "prediction_id_column_name",
+    "timestamp_column_name",
+    "prediction_label_column_name",
+    "prediction_score_column_name",
+    "actual_label_column_name",
+    "actual_score_column_name",
+)
+LLM_SCHEMA_FIELD_NAMES = ["prompt_column_names", "response_column_names"]
+
+
+@dataclass(frozen=True)
+class EmbeddingColumnNames(Dict[str, Any]):
+    """
+    A dataclass to hold the column names for the embedding features.
+    An embedding feature is a feature that is represented by a vector.
+    The vector is a representation of unstructured data, such as text or an image
+    """
+
+    vector_column_name: str
+    raw_data_column_name: Optional[str] = None
+    link_to_data_column_name: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class RetrievalEmbeddingColumnNames(EmbeddingColumnNames):
+    """
+    A relationship is a column that maps a prediction to another record.
+
+    Example
+    -------
+    For example, in context retrieval from a vector store, a query is
+    embedded and used to search for relevant records in a vector store.
+    In this case you would add a column to the dataset that maps the query
+    to the vector store records. E.x. [document_1, document_5, document_3]
+
+    A table view of the primary dataset could look like this:
+
+    | query | retrieved_document_ids | document_relevance_scores |
+    |-------|------------------------|---------------------------|
+    | ...   | [doc_1, doc_5, doc_3]  | [0.4567, 0.3456, 0.2345]  |
+    | ...   | [doc_1, doc_6, doc_2]  | [0.7890, 0.6789, 0.5678]  |
+    | ...   | [doc_1, doc_6, doc_9]  | [0.9012, 0.8901, 0.0123]  |
+
+
+    The corresponding vector store dataset would look like this:
+
+    | id       | embedding_vector | document_text |
+    |----------|------------------|---------------|
+    | doc_1    | ...              | lorem ipsum   |
+    | doc_2    | ...              | lorem ipsum   |
+    | doc_3    | ...              | lorem ipsum   |
+
+
+    To declare this relationship in the schema, you would configure the schema as follows:
+
+    >>> schema = Schema(
+    ...     prompt_column_names=RetrievalEmbeddingColumnNames(
+    ...         context_retrieval_ids_column_name="retrieved_document_ids",
+    ...         context_retrieval_scores_column_name="document_relevance_scores",
+    ...     )
+    ...)
+    """
+
+    context_retrieval_ids_column_name: Optional[str] = None
+    context_retrieval_scores_column_name: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class Schema:
+    prediction_id_column_name: Optional[str] = None
+    id_column_name: Optional[str] = None  # Syntax sugar for prediction_id_column_name
+    timestamp_column_name: Optional[str] = None
+    feature_column_names: Optional[List[str]] = None
+    tag_column_names: Optional[List[str]] = None
+    prediction_label_column_name: Optional[str] = None
+    prediction_score_column_name: Optional[str] = None
+    actual_label_column_name: Optional[str] = None
+    actual_score_column_name: Optional[str] = None
+    prompt_column_names: Optional[Union[EmbeddingColumnNames, RetrievalEmbeddingColumnNames]] = None
+    response_column_names: Optional[Union[str, EmbeddingColumnNames]] = None
+    # document_column_names is used explicitly when the schema is used to capture a corpus
+    document_column_names: Optional[EmbeddingColumnNames] = None
+    embedding_feature_column_names: Optional[EmbeddingFeatures] = None
+    excluded_column_names: Optional[List[str]] = None
+
+    def __post_init__(self) -> None:
+        # re-map document_column_names to be in the prompt_column_names position
+        # This is a shortcut to leverage the same schema for model and corpus datasets
+        if self.document_column_names is not None:
+            object.__setattr__(self, "prompt_column_names", self.document_column_names)
+            object.__setattr__(self, "document_column_names", None)
+
+        if self.id_column_name is not None:
+            object.__setattr__(self, "prediction_id_column_name", self.id_column_name)
+            object.__setattr__(self, "id_column_name", None)
+
+    def replace(self, **changes: Any) -> "Schema":
+        return replace(self, **changes)
+
+    def asdict(self) -> Dict[str, str]:
+        return asdict(self)
+
+    def to_json(self) -> str:
+        "Converts the schema to a dict for JSON serialization"
+        return json.dumps(asdict(self))
+
+    @classmethod
+    def from_json(cls, json_string: str) -> "Schema":
+        json_data = json.loads(json_string)
+
+        # parse embedding_feature_column_names
+        if json_data.get("embedding_feature_column_names") is not None:
+            embedding_feature_column_names = {}
+            for feature_name, column_names in json_data["embedding_feature_column_names"].items():
+                embedding_feature_column_names[feature_name] = EmbeddingColumnNames(
+                    vector_column_name=column_names["vector_column_name"],
+                    raw_data_column_name=column_names["raw_data_column_name"],
+                    link_to_data_column_name=column_names["link_to_data_column_name"],
+                )
+            json_data["embedding_feature_column_names"] = embedding_feature_column_names
+
+        # parse prompt_column_names
+        if (prompt := json_data.get("prompt_column_names")) is not None:
+            json_data["prompt_column_names"] = RetrievalEmbeddingColumnNames(
+                vector_column_name=prompt.get("vector_column_name"),
+                raw_data_column_name=prompt.get("raw_data_column_name"),
+                context_retrieval_ids_column_name=prompt.get("context_retrieval_ids_column_name"),
+                context_retrieval_scores_column_name=prompt.get(
+                    "context_retrieval_scores_column_name"
+                ),
+            )
+
+        # parse response_column_names
+        if isinstance(json_data.get("response_column_names"), Mapping):
+            response_column_names = EmbeddingColumnNames(
+                vector_column_name=json_data["response_column_names"]["vector_column_name"],
+                raw_data_column_name=json_data["response_column_names"]["raw_data_column_name"],
+            )
+            json_data["response_column_names"] = response_column_names
+
+        return cls(**json_data)
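A small round-trip sketch of the to_json/from_json helpers defined above; the top-level import path and the column names are assumptions for illustration.

# Illustrative only: serialize a Schema and reconstruct it from JSON.
from phoenix import EmbeddingColumnNames, Schema

schema = Schema(
    prediction_id_column_name="prediction_id",
    timestamp_column_name="timestamp",
    embedding_feature_column_names={
        "text_embedding": EmbeddingColumnNames(vector_column_name="text_vector"),
    },
)
restored = Schema.from_json(schema.to_json())
assert restored.embedding_feature_column_names is not None
assert restored.embedding_feature_column_names["text_embedding"].vector_column_name == "text_vector"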
phoenix/server/app.py CHANGED
@@ -142,6 +142,10 @@ async def version(_: Request) -> PlainTextResponse:
     return PlainTextResponse(f"{phoenix.__version__}")


+async def check_healthz(_: Request) -> PlainTextResponse:
+    return PlainTextResponse("OK")
+
+
 def create_app(
     export_path: Path,
     model: Model,
@@ -193,6 +197,7 @@ def create_app(
         )
         + [
             Route("/arize_phoenix_version", version),
+            Route("/healthz", check_healthz),
             Route(
                 "/exports",
                 type(
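The second hunk wires the new check_healthz handler to a /healthz route. A hedged liveness-probe sketch, assuming a Phoenix server is already running locally; 6006 is the usual default port, but adjust to your deployment.

# Illustrative only: probe the new /healthz endpoint of a running server.
from urllib.request import urlopen

with urlopen("http://localhost:6006/healthz", timeout=5) as response:
    assert response.status == 200
    assert response.read().decode() == "OK"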