arize-phoenix 3.19.4__py3-none-any.whl → 3.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.21.0.dist-info}/METADATA +7 -7
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.21.0.dist-info}/RECORD +23 -18
- phoenix/__init__.py +7 -3
- phoenix/core/model.py +8 -6
- phoenix/core/model_schema_adapter.py +6 -6
- phoenix/datasets/dataset.py +9 -521
- phoenix/datasets/fixtures.py +16 -552
- phoenix/datasets/schema.py +24 -145
- phoenix/inferences/__init__.py +0 -0
- phoenix/inferences/fixtures.py +560 -0
- phoenix/inferences/inferences.py +730 -0
- phoenix/inferences/schema.py +151 -0
- phoenix/server/app.py +5 -0
- phoenix/server/main.py +8 -8
- phoenix/session/evaluation.py +1 -2
- phoenix/session/session.py +23 -23
- phoenix/utilities/deprecation.py +30 -0
- phoenix/version.py +1 -1
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.21.0.dist-info}/WHEEL +0 -0
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.21.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.21.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → inferences}/errors.py +0 -0
- /phoenix/{datasets → inferences}/validation.py +0 -0
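The substantive change in this release is a rename: the phoenix.datasets module moves to phoenix.inferences, and the Dataset class becomes Inferences, with the old names kept as deprecated shims (see the diff to phoenix/datasets/dataset.py below). A minimal migration sketch, assuming Inferences is re-exported at the package root as the change to phoenix/__init__.py suggests; the dataframe contents here are hypothetical:

import pandas as pd
import phoenix as px

df = pd.DataFrame({"score": [0.1, 0.9]})  # hypothetical example data

# Before 3.21.0:
ds = px.Dataset(dataframe=df, schema=px.Schema(), name="primary")

# From 3.21.0 on, px.Dataset still constructs (it now subclasses Inferences)
# but is flagged as deprecated; the replacement spelling is:
inf = px.Inferences(dataframe=df, schema=px.Schema(), name="primary")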
phoenix/datasets/dataset.py CHANGED
@@ -1,128 +1,20 @@
-import logging
 import re
-import
-from copy import deepcopy
-from dataclasses import dataclass, fields, replace
+from dataclasses import dataclass, replace
 from enum import Enum
 from itertools import groupby
-from typing import
+from typing import Dict
 
-
-import pandas as pd
-from pandas import DataFrame, Series, Timestamp, read_parquet
-from pandas.api.types import (
-    is_numeric_dtype,
-)
-from typing_extensions import TypeAlias
+from pandas import DataFrame
 
-from phoenix.
-from phoenix.
+from phoenix.inferences.inferences import Inferences
+from phoenix.inferences.schema import EmbeddingColumnNames, RetrievalEmbeddingColumnNames, Schema
+from phoenix.utilities.deprecation import deprecated, deprecated_class
 
-from . import errors as err
-from .schema import (
-    LLM_SCHEMA_FIELD_NAMES,
-    MULTI_COLUMN_SCHEMA_FIELD_NAMES,
-    SINGLE_COLUMN_SCHEMA_FIELD_NAMES,
-    EmbeddingColumnNames,
-    EmbeddingFeatures,
-    RetrievalEmbeddingColumnNames,
-    Schema,
-    SchemaFieldName,
-    SchemaFieldValue,
-)
-from .validation import validate_dataset_inputs
-
-logger = logging.getLogger(__name__)
-
-# A schema like object. Not recommended to use this directly
-SchemaLike: TypeAlias = Any
-
-
-class Dataset:
-    """
-    A dataset to use for analysis using phoenix.
-    Used to construct a phoenix session via px.launch_app
-
-    Parameters
-    ----------
-    dataframe : pandas.DataFrame
-        The pandas dataframe containing the data to analyze
-    schema : phoenix.Schema
-        the schema of the dataset. Maps dataframe columns to the appropriate
-        model inference dimensions (features, predictions, actuals).
-    name : str, optional
-        The name of the dataset. If not provided, a random name will be generated.
-        Is helpful for identifying the dataset in the application.
-
-    Returns
-    -------
-    dataset : Dataset
-        The dataset object that can be used in a phoenix session
-
-    Examples
-    --------
-    >>> primary_dataset = px.Dataset(dataframe=production_dataframe, schema=schema, name="primary")
-    """
-
-    _data_file_name: str = "data.parquet"
-    _schema_file_name: str = "schema.json"
-    _is_persisted: bool = False
-    _is_empty: bool = False
-
-    def __init__(
-        self,
-        dataframe: DataFrame,
-        schema: Union[Schema, SchemaLike],
-        name: Optional[str] = None,
-    ):
-        # allow for schema like objects
-        if not isinstance(schema, Schema):
-            schema = _get_schema_from_unknown_schema_param(schema)
-        errors = validate_dataset_inputs(
-            dataframe=dataframe,
-            schema=schema,
-        )
-        if errors:
-            raise err.DatasetError(errors)
-        dataframe, schema = _parse_dataframe_and_schema(dataframe, schema)
-        dataframe, schema = _normalize_timestamps(
-            dataframe, schema, default_timestamp=Timestamp.utcnow()
-        )
-        dataframe = _sort_dataframe_rows_by_timestamp(dataframe, schema)
-        self.__dataframe: DataFrame = dataframe
-        self.__schema: Schema = schema
-        self.__name: str = (
-            name if name is not None else f"{GENERATED_DATASET_NAME_PREFIX}{str(uuid.uuid4())}"
-        )
-        self._is_empty = self.dataframe.empty
-        logger.info(f"""Dataset: {self.__name} initialized""")
-
-    def __repr__(self) -> str:
-        return f'<Dataset "{self.name}">'
-
-    @property
-    def dataframe(self) -> DataFrame:
-        return self.__dataframe
-
-    @property
-    def schema(self) -> "Schema":
-        return self.__schema
-
-    @property
-    def name(self) -> str:
-        return self.__name
-
-    @classmethod
-    def from_name(cls, name: str) -> "Dataset":
-        """Retrieves a dataset by name from the file system"""
-        directory = DATASET_DIR / name
-        df = read_parquet(directory / cls._data_file_name)
-        with open(directory / cls._schema_file_name) as schema_file:
-            schema_json = schema_file.read()
-        schema = Schema.from_json(schema_json)
-        return cls(df, schema, name)
 
+@deprecated_class("phoenix.Dataset is deprecated, use phoenix.Inference instead.")
+class Dataset(Inferences):
     @classmethod
+    @deprecated("Dataset.from_open_inference is deprecated and will be removed.")
     def from_open_inference(cls, dataframe: DataFrame) -> "Dataset":
         schema = Schema()
         column_renaming: Dict[str, str] = {}
@@ -276,406 +168,6 @@ class Dataset:
             schema,
         )
 
-    def to_disc(self) -> None:
-        """writes the data and schema to disc"""
-        directory = DATASET_DIR / self.name
-        directory.mkdir(parents=True, exist_ok=True)
-        self.dataframe.to_parquet(
-            directory / self._data_file_name,
-            allow_truncated_timestamps=True,
-            coerce_timestamps="ms",
-        )
-        schema_json_data = self.schema.to_json()
-        with open(directory / self._schema_file_name, "w+") as schema_file:
-            schema_file.write(schema_json_data)
-
-
-def _parse_dataframe_and_schema(dataframe: DataFrame, schema: Schema) -> Tuple[DataFrame, Schema]:
-    """
-    Parses a dataframe according to a schema, infers feature columns names when
-    they are not explicitly provided, and removes excluded column names from
-    both dataframe and schema.
-
-    Removes column names in `schema.excluded_column_names` from the input dataframe and schema. To
-    remove an embedding feature and all associated columns, add the name of the embedding feature to
-    `schema.excluded_column_names` rather than the associated column names. If
-    `schema.feature_column_names` is `None`, automatically discovers features by adding all column
-    names present in the dataframe but not included in any other schema fields.
-    """
-
-    unseen_excluded_column_names: Set[str] = (
-        set(schema.excluded_column_names) if schema.excluded_column_names is not None else set()
-    )
-    unseen_column_names: Set[str] = set(dataframe.columns.to_list())
-    column_name_to_include: Dict[str, bool] = {}
-    schema_patch: Dict[SchemaFieldName, SchemaFieldValue] = {}
-
-    for schema_field_name in SINGLE_COLUMN_SCHEMA_FIELD_NAMES:
-        _check_single_column_schema_field_for_excluded_columns(
-            schema,
-            schema_field_name,
-            unseen_excluded_column_names,
-            schema_patch,
-            column_name_to_include,
-            unseen_column_names,
-        )
-
-    for schema_field_name in MULTI_COLUMN_SCHEMA_FIELD_NAMES:
-        _check_multi_column_schema_field_for_excluded_columns(
-            schema,
-            schema_field_name,
-            unseen_excluded_column_names,
-            schema_patch,
-            column_name_to_include,
-            unseen_column_names,
-        )
-
-    if schema.embedding_feature_column_names:
-        _check_embedding_features_schema_field_for_excluded_columns(
-            schema.embedding_feature_column_names,
-            unseen_excluded_column_names,
-            schema_patch,
-            column_name_to_include,
-            unseen_column_names,
-        )
-
-    for llm_schema_field_name in LLM_SCHEMA_FIELD_NAMES:
-        embedding_column_name_mapping = getattr(schema, llm_schema_field_name)
-        if isinstance(embedding_column_name_mapping, EmbeddingColumnNames):
-            _check_embedding_column_names_for_excluded_columns(
-                embedding_column_name_mapping,
-                column_name_to_include,
-                unseen_column_names,
-            )
-
-    if not schema.feature_column_names and unseen_column_names:
-        _discover_feature_columns(
-            dataframe,
-            unseen_excluded_column_names,
-            schema_patch,
-            column_name_to_include,
-            unseen_column_names,
-        )
-
-    if unseen_excluded_column_names:
-        logger.warning(
-            "The following columns and embedding features were excluded in the schema but were "
-            "not found in the dataframe: {}".format(", ".join(unseen_excluded_column_names))
-        )
-
-    parsed_dataframe, parsed_schema = _create_and_normalize_dataframe_and_schema(
-        dataframe, schema, schema_patch, column_name_to_include
-    )
-
-    return parsed_dataframe, parsed_schema
-
-
-def _check_single_column_schema_field_for_excluded_columns(
-    schema: Schema,
-    schema_field_name: str,
-    unseen_excluded_column_names: Set[str],
-    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
-    column_name_to_include: Dict[str, bool],
-    unseen_column_names: Set[str],
-) -> None:
-    """
-    Checks single-column schema fields for excluded column names.
-    """
-    column_name: str = getattr(schema, schema_field_name)
-    include_column: bool = column_name not in unseen_excluded_column_names
-    column_name_to_include[column_name] = include_column
-    if not include_column:
-        schema_patch[schema_field_name] = None
-        unseen_excluded_column_names.discard(column_name)
-        logger.debug(f"excluded {schema_field_name}: {column_name}")
-    unseen_column_names.discard(column_name)
-
-
-def _check_multi_column_schema_field_for_excluded_columns(
-    schema: Schema,
-    schema_field_name: str,
-    unseen_excluded_column_names: Set[str],
-    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
-    column_name_to_include: Dict[str, bool],
-    unseen_column_names: Set[str],
-) -> None:
-    """
-    Checks multi-column schema fields for excluded columns names.
-    """
-    column_names: Optional[List[str]] = getattr(schema, schema_field_name)
-    if column_names:
-        included_column_names: List[str] = []
-        excluded_column_names: List[str] = []
-        for column_name in column_names:
-            is_included_column = column_name not in unseen_excluded_column_names
-            column_name_to_include[column_name] = is_included_column
-            if is_included_column:
-                included_column_names.append(column_name)
-            else:
-                excluded_column_names.append(column_name)
-                unseen_excluded_column_names.discard(column_name)
-                logger.debug(f"excluded {schema_field_name}: {column_name}")
-            unseen_column_names.discard(column_name)
-        schema_patch[schema_field_name] = included_column_names if included_column_names else None
-
-
-def _check_embedding_features_schema_field_for_excluded_columns(
-    embedding_features: EmbeddingFeatures,
-    unseen_excluded_column_names: Set[str],
-    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
-    column_name_to_include: Dict[str, bool],
-    unseen_column_names: Set[str],
-) -> None:
-    """
-    Check embedding features for excluded column names.
-    """
-    included_embedding_features: EmbeddingFeatures = {}
-    for (
-        embedding_feature_name,
-        embedding_column_name_mapping,
-    ) in embedding_features.items():
-        include_embedding_feature = embedding_feature_name not in unseen_excluded_column_names
-        if include_embedding_feature:
-            included_embedding_features[embedding_feature_name] = deepcopy(
-                embedding_column_name_mapping
-            )
-        else:
-            unseen_excluded_column_names.discard(embedding_feature_name)
-
-        for embedding_field in fields(embedding_column_name_mapping):
-            column_name: Optional[str] = getattr(
-                embedding_column_name_mapping, embedding_field.name
-            )
-            if column_name is not None:
-                column_name_to_include[column_name] = include_embedding_feature
-                if (
-                    column_name != embedding_feature_name
-                    and column_name in unseen_excluded_column_names
-                ):
-                    logger.warning(
-                        f"Excluding embedding feature columns such as "
-                        f'"{column_name}" has no effect; instead exclude the '
-                        f'corresponding embedding feature name "{embedding_feature_name}".'
-                    )
-                    unseen_excluded_column_names.discard(column_name)
-                unseen_column_names.discard(column_name)
-    schema_patch["embedding_feature_column_names"] = (
-        included_embedding_features if included_embedding_features else None
-    )
-
-
-def _check_embedding_column_names_for_excluded_columns(
-    embedding_column_name_mapping: EmbeddingColumnNames,
-    column_name_to_include: Dict[str, bool],
-    unseen_column_names: Set[str],
-) -> None:
-    """
-    Check embedding column names for excluded column names.
-    """
-    for embedding_field in fields(embedding_column_name_mapping):
-        column_name: Optional[str] = getattr(embedding_column_name_mapping, embedding_field.name)
-        if column_name is not None:
-            column_name_to_include[column_name] = True
-            unseen_column_names.discard(column_name)
-
-
-def _discover_feature_columns(
-    dataframe: DataFrame,
-    unseen_excluded_column_names: Set[str],
-    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
-    column_name_to_include: Dict[str, bool],
-    unseen_column_names: Set[str],
-) -> None:
-    """
-    Adds unseen and un-excluded columns as features, with the exception of "prediction_id"
-    which is reserved
-    """
-    discovered_feature_column_names = []
-    for column_name in unseen_column_names:
-        if column_name not in unseen_excluded_column_names and column_name != "prediction_id":
-            discovered_feature_column_names.append(column_name)
-            column_name_to_include[column_name] = True
-        else:
-            unseen_excluded_column_names.discard(column_name)
-            logger.debug(f"excluded feature: {column_name}")
-    original_column_positions: List[int] = dataframe.columns.get_indexer(
-        discovered_feature_column_names
-    )  # type: ignore
-    feature_column_name_to_position: Dict[str, int] = dict(
-        zip(discovered_feature_column_names, original_column_positions)
-    )
-    discovered_feature_column_names.sort(key=lambda col: feature_column_name_to_position[col])
-    schema_patch["feature_column_names"] = discovered_feature_column_names
-    logger.debug(
-        "Discovered feature column names: {}".format(", ".join(discovered_feature_column_names))
-    )
-
-
-def _create_and_normalize_dataframe_and_schema(
-    dataframe: DataFrame,
-    schema: Schema,
-    schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
-    column_name_to_include: Dict[str, bool],
-) -> Tuple[DataFrame, Schema]:
-    """
-    Creates new dataframe and schema objects to reflect excluded column names
-    and discovered features. This also normalizes dataframe columns to ensure a
-    standard set of columns (i.e. timestamp and prediction_id) and datatypes for
-    those columns.
-    """
-    included_column_names: List[str] = []
-    for column_name in dataframe.columns:
-        if column_name_to_include.get(str(column_name), False):
-            included_column_names.append(str(column_name))
-    parsed_dataframe = dataframe[included_column_names].copy()
-    parsed_schema = replace(schema, excluded_column_names=None, **schema_patch)  # type: ignore
-    pred_id_col_name = parsed_schema.prediction_id_column_name
-    if pred_id_col_name is None:
-        parsed_schema = replace(parsed_schema, prediction_id_column_name="prediction_id")
-        parsed_dataframe["prediction_id"] = _add_prediction_id(len(parsed_dataframe))
-    elif is_numeric_dtype(parsed_dataframe.dtypes[pred_id_col_name]):
-        parsed_dataframe[pred_id_col_name] = parsed_dataframe[pred_id_col_name].astype(str)
-    for embedding in (
-        [parsed_schema.prompt_column_names, parsed_schema.response_column_names]
-        + list(parsed_schema.embedding_feature_column_names.values())
-        if parsed_schema.embedding_feature_column_names is not None
-        else []
-    ):
-        if not isinstance(embedding, EmbeddingColumnNames):
-            continue
-        vector_column_name = embedding.vector_column_name
-        if vector_column_name not in parsed_dataframe.columns:
-            continue
-        parsed_dataframe.loc[:, vector_column_name] = _coerce_vectors_as_arrays_if_necessary(
-            parsed_dataframe.loc[:, vector_column_name],
-            vector_column_name,
-        )
-    return parsed_dataframe, parsed_schema
-
-
-def _coerce_vectors_as_arrays_if_necessary(
-    series: "pd.Series[Any]",
-    column_name: str,
-) -> "pd.Series[Any]":
-    not_na = ~series.isna()
-    if not_na.sum() == 0:
-        return series
-    if invalid_types := set(map(type, series.loc[not_na])) - {np.ndarray}:
-        logger.warning(
-            f"converting items in column `{column_name}` to numpy.ndarray, "
-            f"because they have the following "
-            f"type{'s' if len(invalid_types) > 1 else ''}: "
-            f"{', '.join(map(lambda t: t.__name__, invalid_types))}"
-        )
-        return series.mask(not_na, series.loc[not_na].apply(np.array))
-    return series
-
-
-def _sort_dataframe_rows_by_timestamp(dataframe: DataFrame, schema: Schema) -> DataFrame:
-    """
-    Sorts dataframe rows by timestamp.
-    """
-    timestamp_column_name = schema.timestamp_column_name
-    if timestamp_column_name is None:
-        raise ValueError("Schema must specify a timestamp column name.")
-    dataframe.set_index(timestamp_column_name, drop=False, inplace=True)
-    dataframe.sort_index(inplace=True)
-    return dataframe
-
-
-def _normalize_timestamps(
-    dataframe: DataFrame,
-    schema: Schema,
-    default_timestamp: Timestamp,
-) -> Tuple[DataFrame, Schema]:
-    """
-    Ensures that the dataframe has a timestamp column and the schema has a timestamp field. If the
-    input dataframe contains a Unix or datetime timestamp or ISO8601 timestamp strings column, it
-    is converted to UTC timezone-aware timestamp. If the input dataframe and schema do not contain
-    timestamps, the default timestamp is used. If a timestamp is timezone-naive, it is localized
-    as per local timezone and then converted to UTC
-    """
-    timestamp_column: Series[Timestamp]
-    if (timestamp_column_name := schema.timestamp_column_name) is None:
-        timestamp_column_name = "timestamp"
-        schema = replace(schema, timestamp_column_name=timestamp_column_name)
-        timestamp_column = (
-            Series([default_timestamp] * len(dataframe), index=dataframe.index)
-            if len(dataframe)
-            else Series([default_timestamp]).iloc[:0].set_axis(dataframe.index, axis=0)
-        )
-    else:
-        timestamp_column = normalize_timestamps(
-            dataframe[timestamp_column_name],
-        )
-    dataframe[timestamp_column_name] = timestamp_column
-    return dataframe, schema
-
-
-def _get_schema_from_unknown_schema_param(schemaLike: SchemaLike) -> Schema:
-    """
-    Compatibility function for converting from arize.utils.types.Schema to phoenix.datasets.Schema
-    """
-    try:
-        from arize.utils.types import (
-            EmbeddingColumnNames as ArizeEmbeddingColumnNames,  # fmt: off type: ignore
-        )
-        from arize.utils.types import Schema as ArizeSchema
-
-        if not isinstance(schemaLike, ArizeSchema):
-            raise ValueError("Unknown schema passed to Dataset. Please pass a phoenix Schema")
-
-        embedding_feature_column_names: Dict[str, EmbeddingColumnNames] = {}
-        if schemaLike.embedding_feature_column_names is not None:
-            for (
-                embedding_name,
-                arize_embedding_feature_column_names,
-            ) in schemaLike.embedding_feature_column_names.items():
-                if isinstance(arize_embedding_feature_column_names, ArizeEmbeddingColumnNames):
-                    embedding_feature_column_names[embedding_name] = EmbeddingColumnNames(
-                        vector_column_name=arize_embedding_feature_column_names.vector_column_name,
-                        link_to_data_column_name=arize_embedding_feature_column_names.link_to_data_column_name,
-                        raw_data_column_name=arize_embedding_feature_column_names.data_column_name,
-                    )
-        prompt_column_names: Optional[EmbeddingColumnNames] = None
-        if schemaLike.prompt_column_names is not None and isinstance(
-            schemaLike.prompt_column_names, ArizeEmbeddingColumnNames
-        ):
-            prompt_column_names = EmbeddingColumnNames(
-                vector_column_name=schemaLike.prompt_column_names.vector_column_name,
-                raw_data_column_name=schemaLike.prompt_column_names.data_column_name,
-                link_to_data_column_name=schemaLike.prompt_column_names.link_to_data_column_name,
-            )
-        response_column_names: Optional[EmbeddingColumnNames] = None
-        if schemaLike.response_column_names is not None and isinstance(
-            schemaLike.response_column_names, ArizeEmbeddingColumnNames
-        ):
-            response_column_names = EmbeddingColumnNames(
-                vector_column_name=schemaLike.response_column_names.vector_column_name,
-                raw_data_column_name=schemaLike.response_column_names.data_column_name,
-                link_to_data_column_name=schemaLike.response_column_names.link_to_data_column_name,
-            )
-        return Schema(
-            feature_column_names=schemaLike.feature_column_names,
-            tag_column_names=schemaLike.tag_column_names,
-            prediction_label_column_name=schemaLike.prediction_label_column_name,
-            actual_label_column_name=schemaLike.actual_label_column_name,
-            prediction_id_column_name=schemaLike.prediction_id_column_name,
-            timestamp_column_name=schemaLike.timestamp_column_name,
-            embedding_feature_column_names=embedding_feature_column_names,
-            prompt_column_names=prompt_column_names,
-            response_column_names=response_column_names,
-        )
-    except Exception:
-        raise ValueError(
-            """Unsupported Arize Schema. Please pass a phoenix Schema or update
-            to the latest version of Arize."""
-        )
-
-
-def _add_prediction_id(num_rows: int) -> List[str]:
-    return [str(uuid.uuid4()) for _ in range(num_rows)]
-
 
 class OpenInferenceCategory(Enum):
     id = "id"
@@ -720,7 +212,3 @@ def _parse_open_inference_column_name(column_name: str) -> _OpenInferenceColumnN
             name=extract.get("name", ""),
         )
     raise ValueError(f"Invalid format for column name: {column_name}")
-
-
-# A dataset with no data. Useful for stubs
-EMPTY_DATASET = Dataset(pd.DataFrame(), schema=Schema())