arize-phoenix 3.19.4__py3-none-any.whl → 3.20.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. Click here for more details.

@@ -1,128 +1,20 @@
1
- import logging
2
1
  import re
3
- import uuid
4
- from copy import deepcopy
5
- from dataclasses import dataclass, fields, replace
2
+ from dataclasses import dataclass, replace
6
3
  from enum import Enum
7
4
  from itertools import groupby
8
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
5
+ from typing import Dict
9
6
 
10
- import numpy as np
11
- import pandas as pd
12
- from pandas import DataFrame, Series, Timestamp, read_parquet
13
- from pandas.api.types import (
14
- is_numeric_dtype,
15
- )
16
- from typing_extensions import TypeAlias
7
+ from pandas import DataFrame
17
8
 
18
- from phoenix.config import DATASET_DIR, GENERATED_DATASET_NAME_PREFIX
19
- from phoenix.datetime_utils import normalize_timestamps
9
+ from phoenix.inferences.inferences import Inferences
10
+ from phoenix.inferences.schema import EmbeddingColumnNames, RetrievalEmbeddingColumnNames, Schema
11
+ from phoenix.utilities.deprecation import deprecated, deprecated_class
20
12
 
21
- from . import errors as err
22
- from .schema import (
23
- LLM_SCHEMA_FIELD_NAMES,
24
- MULTI_COLUMN_SCHEMA_FIELD_NAMES,
25
- SINGLE_COLUMN_SCHEMA_FIELD_NAMES,
26
- EmbeddingColumnNames,
27
- EmbeddingFeatures,
28
- RetrievalEmbeddingColumnNames,
29
- Schema,
30
- SchemaFieldName,
31
- SchemaFieldValue,
32
- )
33
- from .validation import validate_dataset_inputs
34
-
35
- logger = logging.getLogger(__name__)
36
-
37
- # A schema like object. Not recommended to use this directly
38
- SchemaLike: TypeAlias = Any
39
-
40
-
41
- class Dataset:
42
- """
43
- A dataset to use for analysis using phoenix.
44
- Used to construct a phoenix session via px.launch_app
45
-
46
- Parameters
47
- ----------
48
- dataframe : pandas.DataFrame
49
- The pandas dataframe containing the data to analyze
50
- schema : phoenix.Schema
51
- the schema of the dataset. Maps dataframe columns to the appropriate
52
- model inference dimensions (features, predictions, actuals).
53
- name : str, optional
54
- The name of the dataset. If not provided, a random name will be generated.
55
- Is helpful for identifying the dataset in the application.
56
-
57
- Returns
58
- -------
59
- dataset : Dataset
60
- The dataset object that can be used in a phoenix session
61
-
62
- Examples
63
- --------
64
- >>> primary_dataset = px.Dataset(dataframe=production_dataframe, schema=schema, name="primary")
65
- """
66
-
67
- _data_file_name: str = "data.parquet"
68
- _schema_file_name: str = "schema.json"
69
- _is_persisted: bool = False
70
- _is_empty: bool = False
71
-
72
- def __init__(
73
- self,
74
- dataframe: DataFrame,
75
- schema: Union[Schema, SchemaLike],
76
- name: Optional[str] = None,
77
- ):
78
- # allow for schema like objects
79
- if not isinstance(schema, Schema):
80
- schema = _get_schema_from_unknown_schema_param(schema)
81
- errors = validate_dataset_inputs(
82
- dataframe=dataframe,
83
- schema=schema,
84
- )
85
- if errors:
86
- raise err.DatasetError(errors)
87
- dataframe, schema = _parse_dataframe_and_schema(dataframe, schema)
88
- dataframe, schema = _normalize_timestamps(
89
- dataframe, schema, default_timestamp=Timestamp.utcnow()
90
- )
91
- dataframe = _sort_dataframe_rows_by_timestamp(dataframe, schema)
92
- self.__dataframe: DataFrame = dataframe
93
- self.__schema: Schema = schema
94
- self.__name: str = (
95
- name if name is not None else f"{GENERATED_DATASET_NAME_PREFIX}{str(uuid.uuid4())}"
96
- )
97
- self._is_empty = self.dataframe.empty
98
- logger.info(f"""Dataset: {self.__name} initialized""")
99
-
100
- def __repr__(self) -> str:
101
- return f'<Dataset "{self.name}">'
102
-
103
- @property
104
- def dataframe(self) -> DataFrame:
105
- return self.__dataframe
106
-
107
- @property
108
- def schema(self) -> "Schema":
109
- return self.__schema
110
-
111
- @property
112
- def name(self) -> str:
113
- return self.__name
114
-
115
- @classmethod
116
- def from_name(cls, name: str) -> "Dataset":
117
- """Retrieves a dataset by name from the file system"""
118
- directory = DATASET_DIR / name
119
- df = read_parquet(directory / cls._data_file_name)
120
- with open(directory / cls._schema_file_name) as schema_file:
121
- schema_json = schema_file.read()
122
- schema = Schema.from_json(schema_json)
123
- return cls(df, schema, name)
124
13
 
14
+ @deprecated_class("phoenix.Dataset is deprecated, use phoenix.Inference instead.")
15
+ class Dataset(Inferences):
125
16
  @classmethod
17
+ @deprecated("Dataset.from_open_inference is deprecated and will be removed.")
126
18
  def from_open_inference(cls, dataframe: DataFrame) -> "Dataset":
127
19
  schema = Schema()
128
20
  column_renaming: Dict[str, str] = {}
@@ -276,406 +168,6 @@ class Dataset:
276
168
  schema,
277
169
  )
278
170
 
279
- def to_disc(self) -> None:
280
- """writes the data and schema to disc"""
281
- directory = DATASET_DIR / self.name
282
- directory.mkdir(parents=True, exist_ok=True)
283
- self.dataframe.to_parquet(
284
- directory / self._data_file_name,
285
- allow_truncated_timestamps=True,
286
- coerce_timestamps="ms",
287
- )
288
- schema_json_data = self.schema.to_json()
289
- with open(directory / self._schema_file_name, "w+") as schema_file:
290
- schema_file.write(schema_json_data)
291
-
292
-
293
- def _parse_dataframe_and_schema(dataframe: DataFrame, schema: Schema) -> Tuple[DataFrame, Schema]:
294
- """
295
- Parses a dataframe according to a schema, infers feature columns names when
296
- they are not explicitly provided, and removes excluded column names from
297
- both dataframe and schema.
298
-
299
- Removes column names in `schema.excluded_column_names` from the input dataframe and schema. To
300
- remove an embedding feature and all associated columns, add the name of the embedding feature to
301
- `schema.excluded_column_names` rather than the associated column names. If
302
- `schema.feature_column_names` is `None`, automatically discovers features by adding all column
303
- names present in the dataframe but not included in any other schema fields.
304
- """
305
-
306
- unseen_excluded_column_names: Set[str] = (
307
- set(schema.excluded_column_names) if schema.excluded_column_names is not None else set()
308
- )
309
- unseen_column_names: Set[str] = set(dataframe.columns.to_list())
310
- column_name_to_include: Dict[str, bool] = {}
311
- schema_patch: Dict[SchemaFieldName, SchemaFieldValue] = {}
312
-
313
- for schema_field_name in SINGLE_COLUMN_SCHEMA_FIELD_NAMES:
314
- _check_single_column_schema_field_for_excluded_columns(
315
- schema,
316
- schema_field_name,
317
- unseen_excluded_column_names,
318
- schema_patch,
319
- column_name_to_include,
320
- unseen_column_names,
321
- )
322
-
323
- for schema_field_name in MULTI_COLUMN_SCHEMA_FIELD_NAMES:
324
- _check_multi_column_schema_field_for_excluded_columns(
325
- schema,
326
- schema_field_name,
327
- unseen_excluded_column_names,
328
- schema_patch,
329
- column_name_to_include,
330
- unseen_column_names,
331
- )
332
-
333
- if schema.embedding_feature_column_names:
334
- _check_embedding_features_schema_field_for_excluded_columns(
335
- schema.embedding_feature_column_names,
336
- unseen_excluded_column_names,
337
- schema_patch,
338
- column_name_to_include,
339
- unseen_column_names,
340
- )
341
-
342
- for llm_schema_field_name in LLM_SCHEMA_FIELD_NAMES:
343
- embedding_column_name_mapping = getattr(schema, llm_schema_field_name)
344
- if isinstance(embedding_column_name_mapping, EmbeddingColumnNames):
345
- _check_embedding_column_names_for_excluded_columns(
346
- embedding_column_name_mapping,
347
- column_name_to_include,
348
- unseen_column_names,
349
- )
350
-
351
- if not schema.feature_column_names and unseen_column_names:
352
- _discover_feature_columns(
353
- dataframe,
354
- unseen_excluded_column_names,
355
- schema_patch,
356
- column_name_to_include,
357
- unseen_column_names,
358
- )
359
-
360
- if unseen_excluded_column_names:
361
- logger.warning(
362
- "The following columns and embedding features were excluded in the schema but were "
363
- "not found in the dataframe: {}".format(", ".join(unseen_excluded_column_names))
364
- )
365
-
366
- parsed_dataframe, parsed_schema = _create_and_normalize_dataframe_and_schema(
367
- dataframe, schema, schema_patch, column_name_to_include
368
- )
369
-
370
- return parsed_dataframe, parsed_schema
371
-
372
-
373
- def _check_single_column_schema_field_for_excluded_columns(
374
- schema: Schema,
375
- schema_field_name: str,
376
- unseen_excluded_column_names: Set[str],
377
- schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
378
- column_name_to_include: Dict[str, bool],
379
- unseen_column_names: Set[str],
380
- ) -> None:
381
- """
382
- Checks single-column schema fields for excluded column names.
383
- """
384
- column_name: str = getattr(schema, schema_field_name)
385
- include_column: bool = column_name not in unseen_excluded_column_names
386
- column_name_to_include[column_name] = include_column
387
- if not include_column:
388
- schema_patch[schema_field_name] = None
389
- unseen_excluded_column_names.discard(column_name)
390
- logger.debug(f"excluded {schema_field_name}: {column_name}")
391
- unseen_column_names.discard(column_name)
392
-
393
-
394
- def _check_multi_column_schema_field_for_excluded_columns(
395
- schema: Schema,
396
- schema_field_name: str,
397
- unseen_excluded_column_names: Set[str],
398
- schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
399
- column_name_to_include: Dict[str, bool],
400
- unseen_column_names: Set[str],
401
- ) -> None:
402
- """
403
- Checks multi-column schema fields for excluded columns names.
404
- """
405
- column_names: Optional[List[str]] = getattr(schema, schema_field_name)
406
- if column_names:
407
- included_column_names: List[str] = []
408
- excluded_column_names: List[str] = []
409
- for column_name in column_names:
410
- is_included_column = column_name not in unseen_excluded_column_names
411
- column_name_to_include[column_name] = is_included_column
412
- if is_included_column:
413
- included_column_names.append(column_name)
414
- else:
415
- excluded_column_names.append(column_name)
416
- unseen_excluded_column_names.discard(column_name)
417
- logger.debug(f"excluded {schema_field_name}: {column_name}")
418
- unseen_column_names.discard(column_name)
419
- schema_patch[schema_field_name] = included_column_names if included_column_names else None
420
-
421
-
422
- def _check_embedding_features_schema_field_for_excluded_columns(
423
- embedding_features: EmbeddingFeatures,
424
- unseen_excluded_column_names: Set[str],
425
- schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
426
- column_name_to_include: Dict[str, bool],
427
- unseen_column_names: Set[str],
428
- ) -> None:
429
- """
430
- Check embedding features for excluded column names.
431
- """
432
- included_embedding_features: EmbeddingFeatures = {}
433
- for (
434
- embedding_feature_name,
435
- embedding_column_name_mapping,
436
- ) in embedding_features.items():
437
- include_embedding_feature = embedding_feature_name not in unseen_excluded_column_names
438
- if include_embedding_feature:
439
- included_embedding_features[embedding_feature_name] = deepcopy(
440
- embedding_column_name_mapping
441
- )
442
- else:
443
- unseen_excluded_column_names.discard(embedding_feature_name)
444
-
445
- for embedding_field in fields(embedding_column_name_mapping):
446
- column_name: Optional[str] = getattr(
447
- embedding_column_name_mapping, embedding_field.name
448
- )
449
- if column_name is not None:
450
- column_name_to_include[column_name] = include_embedding_feature
451
- if (
452
- column_name != embedding_feature_name
453
- and column_name in unseen_excluded_column_names
454
- ):
455
- logger.warning(
456
- f"Excluding embedding feature columns such as "
457
- f'"{column_name}" has no effect; instead exclude the '
458
- f'corresponding embedding feature name "{embedding_feature_name}".'
459
- )
460
- unseen_excluded_column_names.discard(column_name)
461
- unseen_column_names.discard(column_name)
462
- schema_patch["embedding_feature_column_names"] = (
463
- included_embedding_features if included_embedding_features else None
464
- )
465
-
466
-
467
- def _check_embedding_column_names_for_excluded_columns(
468
- embedding_column_name_mapping: EmbeddingColumnNames,
469
- column_name_to_include: Dict[str, bool],
470
- unseen_column_names: Set[str],
471
- ) -> None:
472
- """
473
- Check embedding column names for excluded column names.
474
- """
475
- for embedding_field in fields(embedding_column_name_mapping):
476
- column_name: Optional[str] = getattr(embedding_column_name_mapping, embedding_field.name)
477
- if column_name is not None:
478
- column_name_to_include[column_name] = True
479
- unseen_column_names.discard(column_name)
480
-
481
-
482
- def _discover_feature_columns(
483
- dataframe: DataFrame,
484
- unseen_excluded_column_names: Set[str],
485
- schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
486
- column_name_to_include: Dict[str, bool],
487
- unseen_column_names: Set[str],
488
- ) -> None:
489
- """
490
- Adds unseen and un-excluded columns as features, with the exception of "prediction_id"
491
- which is reserved
492
- """
493
- discovered_feature_column_names = []
494
- for column_name in unseen_column_names:
495
- if column_name not in unseen_excluded_column_names and column_name != "prediction_id":
496
- discovered_feature_column_names.append(column_name)
497
- column_name_to_include[column_name] = True
498
- else:
499
- unseen_excluded_column_names.discard(column_name)
500
- logger.debug(f"excluded feature: {column_name}")
501
- original_column_positions: List[int] = dataframe.columns.get_indexer(
502
- discovered_feature_column_names
503
- ) # type: ignore
504
- feature_column_name_to_position: Dict[str, int] = dict(
505
- zip(discovered_feature_column_names, original_column_positions)
506
- )
507
- discovered_feature_column_names.sort(key=lambda col: feature_column_name_to_position[col])
508
- schema_patch["feature_column_names"] = discovered_feature_column_names
509
- logger.debug(
510
- "Discovered feature column names: {}".format(", ".join(discovered_feature_column_names))
511
- )
512
-
513
-
514
- def _create_and_normalize_dataframe_and_schema(
515
- dataframe: DataFrame,
516
- schema: Schema,
517
- schema_patch: Dict[SchemaFieldName, SchemaFieldValue],
518
- column_name_to_include: Dict[str, bool],
519
- ) -> Tuple[DataFrame, Schema]:
520
- """
521
- Creates new dataframe and schema objects to reflect excluded column names
522
- and discovered features. This also normalizes dataframe columns to ensure a
523
- standard set of columns (i.e. timestamp and prediction_id) and datatypes for
524
- those columns.
525
- """
526
- included_column_names: List[str] = []
527
- for column_name in dataframe.columns:
528
- if column_name_to_include.get(str(column_name), False):
529
- included_column_names.append(str(column_name))
530
- parsed_dataframe = dataframe[included_column_names].copy()
531
- parsed_schema = replace(schema, excluded_column_names=None, **schema_patch) # type: ignore
532
- pred_id_col_name = parsed_schema.prediction_id_column_name
533
- if pred_id_col_name is None:
534
- parsed_schema = replace(parsed_schema, prediction_id_column_name="prediction_id")
535
- parsed_dataframe["prediction_id"] = _add_prediction_id(len(parsed_dataframe))
536
- elif is_numeric_dtype(parsed_dataframe.dtypes[pred_id_col_name]):
537
- parsed_dataframe[pred_id_col_name] = parsed_dataframe[pred_id_col_name].astype(str)
538
- for embedding in (
539
- [parsed_schema.prompt_column_names, parsed_schema.response_column_names]
540
- + list(parsed_schema.embedding_feature_column_names.values())
541
- if parsed_schema.embedding_feature_column_names is not None
542
- else []
543
- ):
544
- if not isinstance(embedding, EmbeddingColumnNames):
545
- continue
546
- vector_column_name = embedding.vector_column_name
547
- if vector_column_name not in parsed_dataframe.columns:
548
- continue
549
- parsed_dataframe.loc[:, vector_column_name] = _coerce_vectors_as_arrays_if_necessary(
550
- parsed_dataframe.loc[:, vector_column_name],
551
- vector_column_name,
552
- )
553
- return parsed_dataframe, parsed_schema
554
-
555
-
556
- def _coerce_vectors_as_arrays_if_necessary(
557
- series: "pd.Series[Any]",
558
- column_name: str,
559
- ) -> "pd.Series[Any]":
560
- not_na = ~series.isna()
561
- if not_na.sum() == 0:
562
- return series
563
- if invalid_types := set(map(type, series.loc[not_na])) - {np.ndarray}:
564
- logger.warning(
565
- f"converting items in column `{column_name}` to numpy.ndarray, "
566
- f"because they have the following "
567
- f"type{'s' if len(invalid_types) > 1 else ''}: "
568
- f"{', '.join(map(lambda t: t.__name__, invalid_types))}"
569
- )
570
- return series.mask(not_na, series.loc[not_na].apply(np.array))
571
- return series
572
-
573
-
574
- def _sort_dataframe_rows_by_timestamp(dataframe: DataFrame, schema: Schema) -> DataFrame:
575
- """
576
- Sorts dataframe rows by timestamp.
577
- """
578
- timestamp_column_name = schema.timestamp_column_name
579
- if timestamp_column_name is None:
580
- raise ValueError("Schema must specify a timestamp column name.")
581
- dataframe.set_index(timestamp_column_name, drop=False, inplace=True)
582
- dataframe.sort_index(inplace=True)
583
- return dataframe
584
-
585
-
586
- def _normalize_timestamps(
587
- dataframe: DataFrame,
588
- schema: Schema,
589
- default_timestamp: Timestamp,
590
- ) -> Tuple[DataFrame, Schema]:
591
- """
592
- Ensures that the dataframe has a timestamp column and the schema has a timestamp field. If the
593
- input dataframe contains a Unix or datetime timestamp or ISO8601 timestamp strings column, it
594
- is converted to UTC timezone-aware timestamp. If the input dataframe and schema do not contain
595
- timestamps, the default timestamp is used. If a timestamp is timezone-naive, it is localized
596
- as per local timezone and then converted to UTC
597
- """
598
- timestamp_column: Series[Timestamp]
599
- if (timestamp_column_name := schema.timestamp_column_name) is None:
600
- timestamp_column_name = "timestamp"
601
- schema = replace(schema, timestamp_column_name=timestamp_column_name)
602
- timestamp_column = (
603
- Series([default_timestamp] * len(dataframe), index=dataframe.index)
604
- if len(dataframe)
605
- else Series([default_timestamp]).iloc[:0].set_axis(dataframe.index, axis=0)
606
- )
607
- else:
608
- timestamp_column = normalize_timestamps(
609
- dataframe[timestamp_column_name],
610
- )
611
- dataframe[timestamp_column_name] = timestamp_column
612
- return dataframe, schema
613
-
614
-
615
- def _get_schema_from_unknown_schema_param(schemaLike: SchemaLike) -> Schema:
616
- """
617
- Compatibility function for converting from arize.utils.types.Schema to phoenix.datasets.Schema
618
- """
619
- try:
620
- from arize.utils.types import (
621
- EmbeddingColumnNames as ArizeEmbeddingColumnNames, # fmt: off type: ignore
622
- )
623
- from arize.utils.types import Schema as ArizeSchema
624
-
625
- if not isinstance(schemaLike, ArizeSchema):
626
- raise ValueError("Unknown schema passed to Dataset. Please pass a phoenix Schema")
627
-
628
- embedding_feature_column_names: Dict[str, EmbeddingColumnNames] = {}
629
- if schemaLike.embedding_feature_column_names is not None:
630
- for (
631
- embedding_name,
632
- arize_embedding_feature_column_names,
633
- ) in schemaLike.embedding_feature_column_names.items():
634
- if isinstance(arize_embedding_feature_column_names, ArizeEmbeddingColumnNames):
635
- embedding_feature_column_names[embedding_name] = EmbeddingColumnNames(
636
- vector_column_name=arize_embedding_feature_column_names.vector_column_name,
637
- link_to_data_column_name=arize_embedding_feature_column_names.link_to_data_column_name,
638
- raw_data_column_name=arize_embedding_feature_column_names.data_column_name,
639
- )
640
- prompt_column_names: Optional[EmbeddingColumnNames] = None
641
- if schemaLike.prompt_column_names is not None and isinstance(
642
- schemaLike.prompt_column_names, ArizeEmbeddingColumnNames
643
- ):
644
- prompt_column_names = EmbeddingColumnNames(
645
- vector_column_name=schemaLike.prompt_column_names.vector_column_name,
646
- raw_data_column_name=schemaLike.prompt_column_names.data_column_name,
647
- link_to_data_column_name=schemaLike.prompt_column_names.link_to_data_column_name,
648
- )
649
- response_column_names: Optional[EmbeddingColumnNames] = None
650
- if schemaLike.response_column_names is not None and isinstance(
651
- schemaLike.response_column_names, ArizeEmbeddingColumnNames
652
- ):
653
- response_column_names = EmbeddingColumnNames(
654
- vector_column_name=schemaLike.response_column_names.vector_column_name,
655
- raw_data_column_name=schemaLike.response_column_names.data_column_name,
656
- link_to_data_column_name=schemaLike.response_column_names.link_to_data_column_name,
657
- )
658
- return Schema(
659
- feature_column_names=schemaLike.feature_column_names,
660
- tag_column_names=schemaLike.tag_column_names,
661
- prediction_label_column_name=schemaLike.prediction_label_column_name,
662
- actual_label_column_name=schemaLike.actual_label_column_name,
663
- prediction_id_column_name=schemaLike.prediction_id_column_name,
664
- timestamp_column_name=schemaLike.timestamp_column_name,
665
- embedding_feature_column_names=embedding_feature_column_names,
666
- prompt_column_names=prompt_column_names,
667
- response_column_names=response_column_names,
668
- )
669
- except Exception:
670
- raise ValueError(
671
- """Unsupported Arize Schema. Please pass a phoenix Schema or update
672
- to the latest version of Arize."""
673
- )
674
-
675
-
676
- def _add_prediction_id(num_rows: int) -> List[str]:
677
- return [str(uuid.uuid4()) for _ in range(num_rows)]
678
-
679
171
 
680
172
  class OpenInferenceCategory(Enum):
681
173
  id = "id"
@@ -720,7 +212,3 @@ def _parse_open_inference_column_name(column_name: str) -> _OpenInferenceColumnN
720
212
  name=extract.get("name", ""),
721
213
  )
722
214
  raise ValueError(f"Invalid format for column name: {column_name}")
723
-
724
-
725
- # A dataset with no data. Useful for stubs
726
- EMPTY_DATASET = Dataset(pd.DataFrame(), schema=Schema())