arize 8.0.0b1__py3-none-any.whl → 8.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +9 -2
- arize/_client_factory.py +50 -0
- arize/_exporter/client.py +18 -17
- arize/_exporter/parsers/tracing_data_parser.py +9 -4
- arize/_exporter/validation.py +1 -1
- arize/_flight/client.py +37 -17
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +6 -6
- arize/_generated/api_client/api/projects_api.py +3 -3
- arize/_lazy.py +61 -10
- arize/client.py +66 -50
- arize/config.py +175 -48
- arize/constants/config.py +1 -0
- arize/constants/ml.py +9 -16
- arize/constants/spans.py +5 -10
- arize/datasets/client.py +45 -28
- arize/datasets/errors.py +1 -1
- arize/datasets/validation.py +2 -2
- arize/embeddings/auto_generator.py +16 -9
- arize/embeddings/base_generators.py +15 -9
- arize/embeddings/cv_generators.py +2 -2
- arize/embeddings/errors.py +2 -2
- arize/embeddings/nlp_generators.py +8 -8
- arize/embeddings/tabular_generators.py +6 -6
- arize/exceptions/base.py +0 -52
- arize/exceptions/config.py +22 -0
- arize/exceptions/parameters.py +1 -330
- arize/exceptions/values.py +8 -5
- arize/experiments/__init__.py +4 -0
- arize/experiments/client.py +31 -18
- arize/experiments/evaluators/base.py +12 -9
- arize/experiments/evaluators/executors.py +16 -7
- arize/experiments/evaluators/rate_limiters.py +3 -1
- arize/experiments/evaluators/types.py +9 -7
- arize/experiments/evaluators/utils.py +7 -5
- arize/experiments/functions.py +128 -58
- arize/experiments/tracing.py +4 -1
- arize/experiments/types.py +34 -31
- arize/logging.py +54 -33
- arize/ml/batch_validation/errors.py +10 -1004
- arize/ml/batch_validation/validator.py +351 -291
- arize/ml/bounded_executor.py +25 -6
- arize/ml/casting.py +51 -33
- arize/ml/client.py +43 -35
- arize/ml/proto.py +21 -22
- arize/ml/stream_validation.py +64 -27
- arize/ml/surrogate_explainer/mimic.py +18 -10
- arize/ml/types.py +27 -67
- arize/pre_releases.py +10 -6
- arize/projects/client.py +9 -4
- arize/py.typed +0 -0
- arize/regions.py +11 -11
- arize/spans/client.py +125 -31
- arize/spans/columns.py +32 -36
- arize/spans/conversion.py +12 -11
- arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
- arize/spans/validation/annotations/value_validation.py +11 -14
- arize/spans/validation/common/argument_validation.py +3 -3
- arize/spans/validation/common/dataframe_form_validation.py +7 -7
- arize/spans/validation/common/value_validation.py +11 -14
- arize/spans/validation/evals/dataframe_form_validation.py +4 -4
- arize/spans/validation/evals/evals_validation.py +6 -6
- arize/spans/validation/evals/value_validation.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +1 -1
- arize/spans/validation/metadata/dataframe_form_validation.py +2 -2
- arize/spans/validation/metadata/value_validation.py +23 -1
- arize/spans/validation/spans/dataframe_form_validation.py +2 -2
- arize/spans/validation/spans/spans_validation.py +6 -6
- arize/utils/arrow.py +38 -2
- arize/utils/cache.py +2 -2
- arize/utils/dataframe.py +4 -4
- arize/utils/online_tasks/dataframe_preprocessor.py +15 -11
- arize/utils/openinference_conversion.py +10 -10
- arize/utils/proto.py +0 -1
- arize/utils/types.py +6 -6
- arize/version.py +1 -1
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/METADATA +32 -7
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/RECORD +81 -78
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/WHEEL +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/LICENSE +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/NOTICE +0 -0
@@ -11,6 +11,7 @@ from arize.constants.ml import (
     MAX_FUTURE_YEARS_FROM_CURRENT_TIME,
     MAX_PAST_YEARS_FROM_CURRENT_TIME,
 )
+from arize.exceptions.base import ValidationError
 from arize.exceptions.parameters import InvalidModelVersion, InvalidProjectName
 from arize.spans.columns import (
     SPAN_END_TIME_COL,
@@ -56,7 +57,7 @@ def check_invalid_model_version(
         model_version: The optional model version to validate.

     Returns:
-        List of validation errors if model version is invalid (empty if valid or None).
+        List of validation errors if model version is invalid (empty if valid or :obj:`None`).
     """
     if model_version is None:
         return []
@@ -73,7 +74,7 @@ def check_string_column_value_length(
     max_len: int,
     is_required: bool,
     must_be_json: bool = False,
-) -> list[
+) -> list[ValidationError]:
     """Validate string column values are within length bounds and optionally valid JSON.

     Args:
@@ -90,7 +91,7 @@ def check_string_column_value_length(
     if col_name not in df.columns:
         return []

-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
@@ -129,7 +130,7 @@ def check_string_column_allowed_values(
     col_name: str,
     allowed_values: list[str],
     is_required: bool,
-) -> list[
+) -> list[ValidationError]:
     """Validate that string column values are within allowed values.

     Args:
@@ -144,7 +145,7 @@ def check_string_column_allowed_values(
     if col_name not in df.columns:
         return []

-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
@@ -177,7 +178,7 @@ def check_string_column_allowed_values(
 def check_float_column_valid_numbers(
     df: pd.DataFrame,
     col_name: str,
-) -> list[
+) -> list[ValidationError]:
     """Check that float column contains only finite numbers, no infinity values.

     Args:
@@ -201,11 +202,7 @@ def check_float_column_valid_numbers(

 def check_value_columns_start_end_time(
     df: pd.DataFrame,
-) -> list[
-    InvalidMissingValueInColumn
-    | InvalidTimestampValueInColumn
-    | InvalidStartAndEndTimeValuesInColumn
-]:
+) -> list[ValidationError]:
     """Validate start and end time columns for timestamps and logical ordering.

     Args:
@@ -214,7 +211,7 @@ def check_value_columns_start_end_time(
     Returns:
         List of validation errors for missing values, invalid timestamps, or start > end.
     """
-    errors = []
+    errors: list[ValidationError] = []
     errors += check_value_timestamp(
         df=df,
         col_name=SPAN_START_TIME_COL.name,
@@ -243,7 +240,7 @@ def check_value_timestamp(
     df: pd.DataFrame,
     col_name: str,
     is_required: bool,
-) -> list[
+) -> list[ValidationError]:
     """Validate timestamp column values are within reasonable bounds.

     Args:
@@ -258,7 +255,7 @@ def check_value_timestamp(
     if col_name not in df.columns:
         return []

-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
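The recurring change above is that each validation helper now returns a plain `list[ValidationError]` and callers concatenate the results. A minimal, self-contained sketch of that pattern follows; the `ValidationError` and `check_required_column` names here are illustrative stand-ins mirroring the diff, not the package's actual implementation:

```python
from itertools import chain

import pandas as pd


class ValidationError(Exception):
    """Hypothetical base class mirroring arize.exceptions.base.ValidationError."""

    def error_message(self) -> str:
        return str(self)


class MissingColumnError(ValidationError):
    def __init__(self, col_name: str) -> None:
        super().__init__(f"Required column '{col_name}' is missing.")


def check_required_column(df: pd.DataFrame, col_name: str) -> list[ValidationError]:
    # Each check returns an empty list when the frame is valid,
    # so callers can concatenate results without special-casing None.
    errors: list[ValidationError] = []
    if col_name not in df.columns:
        errors.append(MissingColumnError(col_name))
    return errors


df = pd.DataFrame({"span_id": ["a", "b"]})
all_errors = list(
    chain(
        check_required_column(df, "span_id"),
        check_required_column(df, "start_time"),
    )
)
print([e.error_message() for e in all_errors])
```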
arize/spans/validation/evals/dataframe_form_validation.py
CHANGED
@@ -27,10 +27,10 @@ def log_info_dataframe_extra_column_names(
     """Logs informational message about columns that don't follow evaluation naming conventions.

     Args:
-        df: DataFrame to check for extra column names, or None
+        df: DataFrame to check for extra column names, or :obj:`None`.

     Returns:
-        None
+        :obj:`None`.
     """
     if df is None:
         return
@@ -57,13 +57,13 @@ def log_info_dataframe_extra_column_names(
 def check_dataframe_column_content_type(
     df: pd.DataFrame,
 ) -> list[InvalidDataFrameColumnContentTypes]:
-    """Validates that evaluation DataFrame columns contain expected data types.
+    """Validates that evaluation :class:`pandas.DataFrame` columns contain expected data types.

     Checks that label columns contain strings, score columns contain numbers,
     and explanation columns contain strings.

     Args:
-        df: The DataFrame to validate.
+        df: The :class:`pandas.DataFrame` to validate.

     Returns:
         List of validation errors for columns with incorrect types.
arize/spans/validation/evals/evals_validation.py
CHANGED
@@ -55,13 +55,13 @@ def validate_argument_types(
 def validate_dataframe_form(
     evals_dataframe: pd.DataFrame,
 ) -> list[ValidationError]:
-    """Validate the structure and form of an evaluations DataFrame
+    """Validate the structure and form of an evaluations :class:`pandas.DataFrame`.

     Args:
-        evals_dataframe: The DataFrame containing evaluation data to validate.
+        evals_dataframe: The :class:`pandas.DataFrame` containing evaluation data to validate.

     Returns:
-        List of validation errors found in the DataFrame structure.
+        List of validation errors found in the :class:`pandas.DataFrame` structure.
     """
     df_validation.log_info_dataframe_extra_column_names(evals_dataframe)
     checks = chain(
@@ -84,15 +84,15 @@ def validate_values(
     project_name: str,
     model_version: str | None = None,
 ) -> list[ValidationError]:
-    """Validate the values within an evaluations DataFrame
+    """Validate the values within an evaluations :class:`pandas.DataFrame`.

     Args:
-        evals_dataframe: The DataFrame containing evaluation data to validate.
+        evals_dataframe: The :class:`pandas.DataFrame` containing evaluation data to validate.
         project_name: The project name associated with the evaluations.
         model_version: Optional model version. Defaults to None.

     Returns:
-        List of validation errors found in DataFrame values.
+        List of validation errors found in :class:`pandas.DataFrame` values.
     """
     checks = chain(
         # Common
arize/spans/validation/metadata/dataframe_form_validation.py
CHANGED
@@ -7,7 +7,7 @@ from arize.spans.columns import SPAN_SPAN_ID_COL


 class MetadataFormError(ValidationError):
-    """Raised when metadata DataFrame structure or format is invalid."""
+    """Raised when metadata :class:`pandas.DataFrame` structure or format is invalid."""

     def __init__(self, message: str, resolution: str) -> None:
         """Initialize the exception with metadata form error context.
@@ -41,7 +41,7 @@ def validate_dataframe_form(
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []

     # Check for empty dataframe
     if metadata_dataframe.empty:
arize/spans/validation/metadata/value_validation.py
CHANGED
@@ -34,6 +34,28 @@ class MetadataValueError(ValidationError):
         return f"{self.message} {self.resolution}"


+class InvalidPatchDocumentFormat(ValidationError):
+    """Raised when patch document format is invalid or cannot be parsed."""
+
+    def __init__(self, row_idx: int, message: str) -> None:
+        """Initialize the exception with patch document format error context.
+
+        Args:
+            row_idx: The row index where the invalid patch was found.
+            message: Detailed error message describing the format issue.
+        """
+        self.row_idx = row_idx
+        self.message = message
+
+    def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
+        return "Invalid_Patch_Document_Format"
+
+    def error_message(self) -> str:
+        """Return the error message for this exception."""
+        return f"Row {self.row_idx}: {self.message}"
+
+
 def calculate_json_depth(obj: object, current_depth: int = 1) -> int:
     """Calculate the maximum nesting depth of a JSON object.

@@ -67,7 +89,7 @@ def validate_values(
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []

     # Skip validation if span_id column is not present
     if SPAN_SPAN_ID_COL.name not in metadata_dataframe.columns:
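For context, `calculate_json_depth` (whose signature appears in this hunk) measures how deeply a JSON-like object is nested. A minimal recursive sketch of that idea, counting container levels only, is shown below; it is an illustration, not the package's actual body:

```python
def calculate_json_depth(obj: object, current_depth: int = 1) -> int:
    """Depth of nested dicts/lists; a scalar or empty container counts as depth 1."""
    if isinstance(obj, dict):
        children = list(obj.values())
    elif isinstance(obj, list):
        children = list(obj)
    else:
        children = []
    nested = [c for c in children if isinstance(c, (dict, list))]
    if not nested:
        return current_depth
    return max(calculate_json_depth(c, current_depth + 1) for c in nested)


assert calculate_json_depth({"a": {"b": [1, 2]}}) == 3
assert calculate_json_depth("scalar") == 1
```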
arize/spans/validation/spans/dataframe_form_validation.py
CHANGED
@@ -50,13 +50,13 @@ def log_info_dataframe_extra_column_names(
 def check_dataframe_column_content_type(
     df: pd.DataFrame,
 ) -> list[InvalidDataFrameColumnContentTypes]:
-    """Validates
+    """Validates span :class:`pandas.DataFrame` columns match OpenInference types.

     Checks that columns have appropriate data types: lists of dicts, dicts, numeric,
     boolean, timestamp, JSON strings, or plain strings based on column specifications.

     Args:
-        df: The DataFrame to validate.
+        df: The :class:`pandas.DataFrame` to validate.

     Returns:
         List of validation errors for columns with incorrect types.
arize/spans/validation/spans/spans_validation.py
CHANGED
@@ -56,13 +56,13 @@ def validate_argument_types(
 def validate_dataframe_form(
     spans_dataframe: pd.DataFrame,
 ) -> list[ValidationError]:
-    """Validate the structure and form of a spans DataFrame
+    """Validate the structure and form of a spans :class:`pandas.DataFrame`.

     Args:
-        spans_dataframe: The DataFrame containing spans data to validate.
+        spans_dataframe: The :class:`pandas.DataFrame` containing spans data to validate.

     Returns:
-        List of validation errors found in the DataFrame structure.
+        List of validation errors found in the :class:`pandas.DataFrame` structure.
     """
     df_validation.log_info_dataframe_extra_column_names(spans_dataframe)
     checks = chain(
@@ -88,15 +88,15 @@ def validate_values(
     project_name: str,
     model_version: str | None = None,
 ) -> list[ValidationError]:
-    """Validate the values within a spans DataFrame
+    """Validate the values within a spans :class:`pandas.DataFrame`.

     Args:
-        spans_dataframe: The DataFrame containing spans data to validate.
+        spans_dataframe: The :class:`pandas.DataFrame` containing spans data to validate.
         project_name: The project name associated with the spans.
         model_version: Optional model version. Defaults to None.

     Returns:
-        List of validation errors found in DataFrame values.
+        List of validation errors found in :class:`pandas.DataFrame` values.
     """
     checks = chain(
         # Common
arize/utils/arrow.py
CHANGED
@@ -1,6 +1,5 @@
 """Apache Arrow utilities for data serialization and file operations."""

-# type: ignore[pb2]
 from __future__ import annotations

 import base64
@@ -38,7 +37,7 @@ def post_arrow_table(
         pa_table: The PyArrow table containing the data.
         proto_schema: The protobuf schema for the data.
         headers: HTTP headers for the request.
-        timeout: Request timeout in seconds, or None for no timeout.
+        timeout: Request timeout in seconds, or :obj:`None` for no timeout.
         verify: Whether to verify SSL certificates.
         max_chunksize: Maximum chunk size for splitting large tables.
         tmp_dir: Temporary directory for serialization. Defaults to "".
@@ -124,6 +123,18 @@ def post_arrow_table(
 def _append_to_pyarrow_metadata(
     pa_schema: pa.Schema, new_metadata: dict[str, Any]
 ) -> object:
+    """Append metadata to a PyArrow schema without overwriting existing keys.
+
+    Args:
+        pa_schema: The PyArrow schema to add metadata to.
+        new_metadata: Dictionary of metadata key-value pairs to append.
+
+    Returns:
+        pa.Schema: A new PyArrow schema with the merged metadata.
+
+    Raises:
+        KeyError: If any keys in new_metadata conflict with existing schema metadata.
+    """
     # Ensure metadata is handled correctly, even if initially None.
     metadata = pa_schema.metadata
     if metadata is None:
@@ -145,6 +156,14 @@ def _append_to_pyarrow_metadata(
 def _write_arrow_file(
     path: str, pa_table: pa.Table, pa_schema: pa.Schema, max_chunksize: int
 ) -> None:
+    """Write a PyArrow table to an Arrow IPC file with specified schema and chunk size.
+
+    Args:
+        path: The file path where the Arrow file will be written.
+        pa_table: The PyArrow table containing the data to write.
+        pa_schema: The PyArrow schema to use for the file.
+        max_chunksize: Maximum number of rows per record batch chunk.
+    """
     with (
         pa.OSFile(path, mode="wb") as sink,
         pa.ipc.RecordBatchStreamWriter(sink, pa_schema) as writer,
@@ -153,6 +172,15 @@ def _write_arrow_file(


 def _maybe_log_project_url(response: requests.Response) -> None:
+    """Attempt to extract and log the Arize project URL from an HTTP response.
+
+    Args:
+        response: The HTTP response object from an Arize API request.
+
+    Notes:
+        Logs success message with URL if extraction succeeds, or warning if it fails.
+        This function never raises exceptions.
+    """
     try:
         url = get_arize_project_url(response)
         if url:
@@ -176,6 +204,14 @@ def _mktemp_in(directory: str) -> str:


 def _filesize(path: str) -> int:
+    """Get the size of a file in bytes.
+
+    Args:
+        path: The file path to check.
+
+    Returns:
+        int: The file size in bytes, or -1 if the file cannot be accessed.
+    """
     try:
         return os.path.getsize(path)
     except Exception:
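The behavior documented for `_append_to_pyarrow_metadata` (merge new keys into existing schema metadata and refuse to overwrite) can be reproduced with public PyArrow APIs. The helper below is an illustrative stand-in, not the package's implementation:

```python
import pyarrow as pa


def append_metadata(schema: pa.Schema, new_metadata: dict[str, str]) -> pa.Schema:
    # Schema metadata is stored as bytes -> bytes; treat a missing mapping as empty.
    existing = dict(schema.metadata or {})
    encoded = {k.encode(): v.encode() for k, v in new_metadata.items()}
    conflicts = existing.keys() & encoded.keys()
    if conflicts:
        raise KeyError(f"Metadata keys already present: {sorted(conflicts)}")
    # with_metadata returns a new schema; the original is left untouched.
    return schema.with_metadata({**existing, **encoded})


schema = pa.schema([pa.field("span_id", pa.string())]).with_metadata({"source": "arize"})
merged = append_metadata(schema, {"model_version": "v1"})
print(merged.metadata)
```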
arize/utils/cache.py
CHANGED
@@ -31,7 +31,7 @@ def load_cached_resource(
         format: File format for cached data. Defaults to "parquet".

     Returns:
-        The cached DataFrame if found and valid, None otherwise.
+        The cached :class:`pandas.DataFrame` if found and valid, :obj:`None` otherwise.
     """
     key = _get_cache_key(resource, resource_id, resource_updated_at)
     filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
@@ -59,7 +59,7 @@ def cache_resource(
         resource: Resource type name (e.g., "dataset", "experiment").
         resource_id: Unique identifier for the resource.
         resource_updated_at: Optional timestamp of last resource update.
-        resource_data: DataFrame containing the resource data.
+        resource_data: :class:`pandas.DataFrame` containing the resource data.
         format: File format for cached data. Defaults to "parquet".
     """
     key = _get_cache_key(resource, resource_id, resource_updated_at)
arize/utils/dataframe.py
CHANGED
@@ -9,10 +9,10 @@ from arize.ml.types import BaseSchema

 # Resets the dataframe index if it is not a RangeIndex
 def reset_dataframe_index(dataframe: pd.DataFrame) -> None:
-    """Reset the DataFrame index in-place if it is not a RangeIndex.
+    """Reset the :class:`pandas.DataFrame` index in-place if it is not a RangeIndex.

     Args:
-        dataframe: The pandas
+        dataframe: The :class:`pandas.DataFrame` to reset.
     """
     if not isinstance(dataframe.index, pd.RangeIndex):
         drop = dataframe.index.name in dataframe.columns
@@ -25,10 +25,10 @@ def remove_extraneous_columns(
     column_list: list[str] | None = None,
     regex: str | None = None,
 ) -> pd.DataFrame:
-    """Filter DataFrame to keep only relevant columns based on schema, list, or regex.
+    """Filter :class:`pandas.DataFrame` to keep only relevant columns based on schema, list, or regex.

     Args:
-        df: The pandas
+        df: The :class:`pandas.DataFrame` to filter.
         schema: Optional schema defining used columns. Defaults to None.
         column_list: Optional explicit list of columns to keep. Defaults to None.
         regex: Optional regex pattern to match column names. Defaults to None.
arize/utils/online_tasks/dataframe_preprocessor.py
CHANGED
@@ -10,7 +10,7 @@ logger = logging.getLogger(__name__)


 class ColumnNotFoundError(Exception):
-    """Raised when a specified column is not found in the DataFrame
+    """Raised when a specified column is not found in the :class:`pandas.DataFrame`."""

     def __init__(self, attribute: str) -> None:
         """Initialize with the attribute that couldn't be mapped to a column.
@@ -27,13 +27,13 @@ class ColumnNotFoundError(Exception):
 def extract_nested_data_to_column(
     attributes: list[str], df: pd.DataFrame
 ) -> pd.DataFrame:
-    """Extract nested attributes from complex data structures into new DataFrame columns.
+    """Extract nested attributes from complex data structures into new :class:`pandas.DataFrame` columns.

     This function, used in Online Tasks, is typically run on data exported from Arize.
-    It prepares the DataFrame by extracting relevant attributes from complex, deeply
+    It prepares the :class:`pandas.DataFrame` by extracting relevant attributes from complex, deeply
     nested data structures, such as those found in LLM outputs or JSON-like records.
     It helps extract specific values from these nested structures by identifying the
-    longest matching column name in the DataFrame and recursively accessing the desired
+    longest matching column name in the :class:`pandas.DataFrame` and recursively accessing the desired
     attribute path within each row. This preprocessing step ensures that the extracted
     values are available as new columns, allowing evaluators to process and assess
     these values effectively.
@@ -81,9 +81,12 @@ def extract_nested_data_to_column(
         remainder = ".".join(parts[prefix_len:])

         # 3) Apply introspect row-by-row
+        # Type narrowing: prefix_col is guaranteed to be str after the None check above
+        prefix_col_str: str = prefix_col
+
         def apply_introspect_arize_attribute(
-            row: pd.Series,
-            prefix_col: str =
+            row: pd.Series,  # type: ignore[type-arg]
+            prefix_col: str = prefix_col_str,
             remainder: str = remainder,
         ) -> object:
             val = row[prefix_col]
@@ -94,8 +97,9 @@ def extract_nested_data_to_column(
             else:
                 return result if result is not None else np.nan

-        result_df[attribute] = result_df.apply(
-            apply_introspect_arize_attribute,
+        result_df[attribute] = result_df.apply(  # type: ignore[call-overload]
+            apply_introspect_arize_attribute,
+            axis=1,
         )

         new_cols.append(attribute)
@@ -127,7 +131,7 @@ def _introspect_arize_attribute(value: object, attribute: str) -> object:
         attribute: "0.message.content"
         Returns: 'The capital of China is Beijing.'

-    - Returns None immediately when a key or index is not found
+    - Returns :obj:`None` immediately when a key or index is not found
    - Handles integer parts for lists
    - Parses JSON strings
    - Converts NumPy arrays to lists
@@ -174,10 +178,10 @@ def _parse_value(
    2) Else if `current_value` is a dict, check if `attribute_parts_unprocessed[0]` is a key.
       If not found, try combining `attribute_parts_unprocessed[0] + '.' + attribute_parts_unprocessed[1]`...
       to handle dotted keys in the dict.
-    3) If none match, return (None
+    3) If none match, return (:obj:`None`, 1) to signal "not found, consume 1 part."

    Returns (parsed_value, num_parts_processed):
-        - parsed_value: the found value or None if not found
+        - parsed_value: the found value or :obj:`None` if not found
        - num_parts_processed: how many parts were processed (1 or more)
    """
    if not attribute_parts_unprocessed:
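The `prefix_col_str` / default-argument change above exists because closures defined inside a loop late-bind their free variables; binding the current value as a default parameter freezes it per iteration before the callable is handed to a row-wise `apply`. A standalone illustration of that pattern (hypothetical column names, not the package's code):

```python
import pandas as pd

df = pd.DataFrame({"attributes.input": ['{"q": "hi"}', '{"q": "bye"}']})
result = df.copy()

for col in df.columns:
    # Default arguments are evaluated once per loop iteration, so each
    # row-wise callable keeps its own copy of `col`.
    def extract_len(row: pd.Series, col: str = col) -> int:
        return len(row[col])

    result[f"{col}.length"] = result.apply(extract_len, axis=1)

print(result)
```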
arize/utils/openinference_conversion.py
CHANGED
@@ -11,13 +11,13 @@ logger = logging.getLogger(__name__)


 def convert_datetime_columns_to_int(df: pd.DataFrame) -> pd.DataFrame:
-    """Convert datetime columns in a DataFrame to milliseconds since epoch.
+    """Convert datetime columns in a :class:`pandas.DataFrame` to milliseconds since epoch.

     Args:
-        df: The pandas
+        df: The :class:`pandas.DataFrame` to convert.

     Returns:
-        The DataFrame with datetime columns converted to integers.
+        The :class:`pandas.DataFrame` with datetime columns converted to integers.
     """
     for col in df.select_dtypes(
         include=["datetime64[ns]", "datetime64[ns, UTC]"]
@@ -27,13 +27,13 @@ def convert_datetime_columns_to_int(df: pd.DataFrame) -> pd.DataFrame:


 def convert_boolean_columns_to_str(df: pd.DataFrame) -> pd.DataFrame:
-    """Convert boolean columns in a DataFrame to string type.
+    """Convert boolean columns in a :class:`pandas.DataFrame` to string type.

     Args:
-        df: The pandas
+        df: The :class:`pandas.DataFrame` to convert.

     Returns:
-        The DataFrame with boolean columns converted to strings.
+        The :class:`pandas.DataFrame` with boolean columns converted to strings.
     """
     for col in df.columns:
         if df[col].dtype == "bool":
@@ -45,10 +45,10 @@ def convert_default_columns_to_json_str(df: pd.DataFrame) -> pd.DataFrame:
     """Convert dictionary values in specific columns to JSON strings.

     Args:
-        df: The pandas
+        df: The :class:`pandas.DataFrame` to convert.

     Returns:
-        The DataFrame with dictionaries in eligible columns converted to JSON strings.
+        The :class:`pandas.DataFrame` with dictionaries in eligible columns converted to JSON strings.
     """
     for col in df.columns:
         if _should_convert_json(col):
@@ -68,10 +68,10 @@ def convert_json_str_to_dict(df: pd.DataFrame) -> pd.DataFrame:
     """Convert JSON string values in specific columns to Python dictionaries.

     Args:
-        df: The pandas
+        df: The :class:`pandas.DataFrame` to convert.

     Returns:
-        The DataFrame with JSON strings in eligible columns converted to dictionaries.
+        The :class:`pandas.DataFrame` with JSON strings in eligible columns converted to dictionaries.
     """
     for col in df.columns:
         if _should_convert_json(col):
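The conversion described by `convert_datetime_columns_to_int` amounts to re-expressing pandas' nanosecond timestamps as milliseconds since the Unix epoch. A minimal illustration of that idea, not the package's exact implementation:

```python
import pandas as pd

df = pd.DataFrame({"start_time": pd.to_datetime(["2024-01-01", "2024-01-02"])})

for col in df.select_dtypes(include=["datetime64[ns]"]).columns:
    # datetime64[ns] values are nanoseconds since the Unix epoch;
    # integer-divide by 1e6 to get milliseconds.
    df[col] = df[col].astype("int64") // 1_000_000

print(df)
```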
arize/utils/proto.py
CHANGED
arize/utils/types.py
CHANGED
@@ -43,7 +43,7 @@ def is_array_of(arr: Sequence[object], tp: T) -> bool:
     return isinstance(arr, np.ndarray) and all(isinstance(x, tp) for x in arr)


-def is_list_of(lst:
+def is_list_of(lst: object, tp: T) -> bool:
     """Check if a value is a list with all elements of a specific type.

     Args:
@@ -70,10 +70,10 @@ def is_iterable_of(lst: Sequence[object], tp: T) -> bool:


 def is_dict_of(
-    d:
-    key_allowed_types:
-    value_allowed_types:
-    value_list_allowed_types:
+    d: object,
+    key_allowed_types: type | tuple[type, ...],
+    value_allowed_types: type | tuple[type, ...] = (),
+    value_list_allowed_types: type | tuple[type, ...] = (),
 ) -> bool:
     """Method to check types are valid for dictionary.

@@ -98,7 +98,7 @@ def is_dict_of(
         and all(isinstance(k, key_allowed_types) for k in d)
         and all(
             isinstance(v, value_allowed_types)
-            or any(is_list_of(v, t) for t in value_list_allowed_types)
+            or any(is_list_of(v, t) for t in value_list_allowed_types)  # type: ignore[union-attr]
             for v in d.values()
             if value_allowed_types or value_list_allowed_types
         )
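Based on the signatures above, these helpers act as runtime type guards for keys, values, and list-valued entries. A small usage sketch follows; the bodies are stand-ins shaped like the diff, assuming the defaults shown, and are not verified against the package's tests:

```python
# Hypothetical stand-ins with the same shape as the helpers in the diff.
def is_list_of(lst: object, tp: type) -> bool:
    return isinstance(lst, list) and all(isinstance(x, tp) for x in lst)


def is_dict_of(
    d: object,
    key_allowed_types: type | tuple[type, ...],
    value_allowed_types: type | tuple[type, ...] = (),
    value_list_allowed_types: type | tuple[type, ...] = (),
) -> bool:
    if not isinstance(d, dict):
        return False
    keys_ok = all(isinstance(k, key_allowed_types) for k in d)
    values_ok = all(
        isinstance(v, value_allowed_types)
        or any(is_list_of(v, t) for t in value_list_allowed_types)
        for v in d.values()
        if value_allowed_types or value_list_allowed_types
    )
    return keys_ok and values_ok


print(is_dict_of({"tags": ["a", "b"]}, key_allowed_types=str, value_list_allowed_types=(str,)))  # True
print(is_dict_of({1: "x"}, key_allowed_types=str, value_allowed_types=str))  # False
```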
arize/version.py
CHANGED