arize 8.0.0b2__py3-none-any.whl → 8.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +8 -1
- arize/_exporter/client.py +18 -17
- arize/_exporter/parsers/tracing_data_parser.py +9 -4
- arize/_exporter/validation.py +1 -1
- arize/_flight/client.py +33 -13
- arize/_lazy.py +37 -2
- arize/client.py +61 -35
- arize/config.py +168 -14
- arize/constants/config.py +1 -0
- arize/datasets/client.py +32 -19
- arize/embeddings/auto_generator.py +14 -7
- arize/embeddings/base_generators.py +15 -9
- arize/embeddings/cv_generators.py +2 -2
- arize/embeddings/nlp_generators.py +8 -8
- arize/embeddings/tabular_generators.py +5 -5
- arize/exceptions/config.py +22 -0
- arize/exceptions/parameters.py +1 -1
- arize/exceptions/values.py +8 -5
- arize/experiments/__init__.py +4 -0
- arize/experiments/client.py +17 -11
- arize/experiments/evaluators/base.py +6 -3
- arize/experiments/evaluators/executors.py +6 -4
- arize/experiments/evaluators/rate_limiters.py +3 -1
- arize/experiments/evaluators/types.py +7 -5
- arize/experiments/evaluators/utils.py +7 -5
- arize/experiments/functions.py +111 -48
- arize/experiments/tracing.py +4 -1
- arize/experiments/types.py +31 -26
- arize/logging.py +53 -32
- arize/ml/batch_validation/validator.py +82 -70
- arize/ml/bounded_executor.py +25 -6
- arize/ml/casting.py +45 -27
- arize/ml/client.py +35 -28
- arize/ml/proto.py +16 -17
- arize/ml/stream_validation.py +63 -25
- arize/ml/surrogate_explainer/mimic.py +15 -7
- arize/ml/types.py +26 -12
- arize/pre_releases.py +7 -6
- arize/py.typed +0 -0
- arize/regions.py +10 -10
- arize/spans/client.py +113 -21
- arize/spans/conversion.py +7 -5
- arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
- arize/spans/validation/annotations/value_validation.py +11 -14
- arize/spans/validation/common/dataframe_form_validation.py +1 -1
- arize/spans/validation/common/value_validation.py +10 -13
- arize/spans/validation/evals/value_validation.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +1 -1
- arize/spans/validation/metadata/dataframe_form_validation.py +1 -1
- arize/spans/validation/metadata/value_validation.py +23 -1
- arize/utils/arrow.py +37 -1
- arize/utils/online_tasks/dataframe_preprocessor.py +8 -4
- arize/utils/proto.py +0 -1
- arize/utils/types.py +6 -6
- arize/version.py +1 -1
- {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/METADATA +18 -3
- {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/RECORD +60 -58
- {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/WHEEL +0 -0
- {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/licenses/LICENSE +0 -0
- {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/licenses/NOTICE +0 -0
arize/regions.py
CHANGED
@@ -1,18 +1,18 @@
 """Region definitions and configuration for Arize deployment zones."""
 
 from dataclasses import dataclass
-from enum import …
+from enum import Enum
 
 from arize.constants.config import DEFAULT_FLIGHT_PORT
 
 
-class Region(…
+class Region(Enum):
     """Enum representing available Arize deployment regions."""
 
-    …
-    …
-    …
-    …
+    CA_CENTRAL_1A = "ca-central-1a"
+    EU_WEST_1A = "eu-west-1a"
+    US_CENTRAL_1A = "us-central-1a"
+    US_EAST_1B = "us-east-1b"
     UNSET = ""
 
 
@@ -28,13 +28,13 @@ class RegionEndpoints:
 
 def _get_region_endpoints(region: Region) -> RegionEndpoints:
     return RegionEndpoints(
-        api_host=f"api.{region}.arize.com",
-        otlp_host=f"otlp.{region}.arize.com",
-        flight_host=f"flight.{region}.arize.com",
+        api_host=f"api.{region.value}.arize.com",
+        otlp_host=f"otlp.{region.value}.arize.com",
+        flight_host=f"flight.{region.value}.arize.com",
         flight_port=DEFAULT_FLIGHT_PORT,
     )
 
 
 REGION_ENDPOINTS: dict[Region, RegionEndpoints] = {
-    r: _get_region_endpoints(r) for r in Region if r != Region.UNSET
+    r: _get_region_endpoints(r) for r in list(Region) if r != Region.UNSET
 }
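
Note: the `.value` change pairs with the switch to a plain `Enum` base. A plain `Enum` member stringifies to its qualified name, not to its payload string, so interpolating the member directly would build the wrong hostname. A standalone sketch (not the SDK module itself) of the difference:

```python
from enum import Enum

class Region(Enum):
    US_EAST_1B = "us-east-1b"

# A plain Enum member stringifies to its qualified name, not its value:
print(f"api.{Region.US_EAST_1B}.arize.com")        # api.Region.US_EAST_1B.arize.com
print(f"api.{Region.US_EAST_1B.value}.arize.com")  # api.us-east-1b.arize.com
```

The `list(Region)` change in the dict comprehension is behaviorally equivalent to iterating the class directly; it reads as a type-checker appeasement rather than a behavior fix.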
arize/spans/client.py
CHANGED
@@ -1,4 +1,3 @@
-# type: ignore[pb2]
 """Client implementation for managing spans and traces in the Arize platform."""
 
 from __future__ import annotations

@@ -21,12 +20,16 @@ from arize._flight.types import FlightRequestType
 from arize.constants.spans import DEFAULT_DATETIME_FMT
 from arize.exceptions.base import (
     INVALID_ARROW_CONVERSION_MSG,
+    ValidationError,
     ValidationFailure,
 )
 from arize.exceptions.models import MissingProjectNameError
 from arize.exceptions.spaces import MissingSpaceIDError
 from arize.logging import CtxAdapter
 from arize.ml.types import Environments
+from arize.spans.validation.metadata.value_validation import (
+    InvalidPatchDocumentFormat,
+)
 from arize.utils.arrow import post_arrow_table
 from arize.utils.dataframe import (
     remove_extraneous_columns,

@@ -850,7 +853,8 @@ class SpansClient:
                 )
                 for idx in range(len(metadata_df))
             ]
-            …
+            # Type ignore: pandas DataFrame column assignment type is overly restrictive
+            metadata_df[final_patch_column] = merged_patches  # type: ignore[assignment]
         else:
             # Just use the field patches directly
             metadata_df[final_patch_column] = field_patches

@@ -887,7 +891,8 @@ class SpansClient:
             log.error(e)
             raise ValidationFailure(validation_errors)
 
-        …
+        # Type ignore: pandas DataFrame column assignment type is overly restrictive
+        metadata_df[final_patch_column] = processed_patches  # type: ignore[assignment]
 
         # Run validations on the processed dataframe
         if validate:

@@ -1054,8 +1059,26 @@ class SpansClient:
 
         Retrieves trace/span data from the specified project within a time range
         and writes it directly to a Parquet file at the specified path. Supports
-        filtering with SQL-like WHERE clauses
-        …
+        filtering with SQL-like WHERE clauses for efficient querying. Ideal for
+        large datasets and long-term storage.
+
+        Args:
+            path: The file path where the Parquet file will be written.
+            space_id: The space ID where the project resides.
+            project_name: The name of the project to export span data from.
+            start_time: Start of the time range (inclusive) as a datetime object.
+            end_time: End of the time range (inclusive) as a datetime object.
+            where: Optional SQL-like WHERE clause to filter rows (e.g., "span.status_code = 'ERROR'").
+            columns: Optional list of column names to include. If None, all columns are returned.
+            stream_chunk_size: Optional chunk size for streaming large result sets.
+
+        Raises:
+            RuntimeError: If the Flight client request fails or returns no response.
+
+        Notes:
+            - Uses Apache Arrow Flight for efficient data transfer
+            - Data is written directly to the specified path as a Parquet file
+            - Large exports may benefit from specifying stream_chunk_size
         """
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,

@@ -1068,7 +1091,7 @@ class SpansClient:
         exporter = ArizeExportClient(
             flight_client=flight_client,
         )
-        …
+        exporter.export_to_parquet(
             path=path,
             space_id=space_id,
             model_id=project_name,

@@ -1082,6 +1105,15 @@ class SpansClient:
 
 
 def _build_patch_document(row: pd.Series) -> dict[str, object]:
+    """Build a patch document from a pandas Series row by extracting metadata fields.
+
+    Args:
+        row: A pandas Series representing a row of data with potential metadata columns.
+
+    Returns:
+        dict[str, object]: A dictionary mapping metadata field names (without the
+            'attributes.metadata.' prefix) to their values, preserving arrays and scalars.
+    """
     # Extract and preserve metadata values with proper types
     patch = {}
     for key in row.index:

@@ -1103,9 +1135,21 @@ def _build_patch_document(row: pd.Series) -> dict[str, object]:
 def _process_patch_document(
     metadata_df: pd.DataFrame,
     patch_document_column_name: str,
-    field_patches: pd.…
+    field_patches: pd.Series[Any],
     row_idx: int,
 ) -> dict[str, object]:
+    """Process and merge patch documents from field patches and explicit patch column.
+
+    Args:
+        metadata_df: DataFrame containing the metadata with patch documents.
+        patch_document_column_name: Name of the column containing explicit patch documents.
+        field_patches: DataFrame containing patches derived from individual metadata fields.
+        row_idx: The row index to process.
+
+    Returns:
+        dict[str, object]: Merged patch document where explicit patches take precedence over
+            field patches. Returns empty dict if patch document is invalid or missing.
+    """
     # Get the field patch for this row
     field_patch = field_patches.iloc[row_idx]

@@ -1152,9 +1196,21 @@ def _ensure_dict_patch(
     metadata_df: pd.DataFrame,
     final_patch_column: str,
     row_idx: int,
-) -> tuple[dict[str, object], list[…
+) -> tuple[dict[str, object], list[ValidationError]]:
+    """Ensure a patch value is a dictionary, converting from JSON string if needed.
+
+    Args:
+        metadata_df: DataFrame containing the patch data.
+        final_patch_column: Name of the column containing the final patch document.
+        row_idx: The row index to process.
+
+    Returns:
+        tuple[dict[str, object], list[ValidationError]]: A tuple containing:
+            - The patch as a dictionary (empty dict if invalid or missing)
+            - List of validation errors (empty if no errors)
+    """
     patch = metadata_df.loc[row_idx, final_patch_column]
-    validation_errors = []
+    validation_errors: list[ValidationError] = []
 
     # For None/null values, return an empty dict
     if patch is None:

@@ -1173,25 +1229,26 @@ def _ensure_dict_patch(
         try:
             parsed = json.loads(patch)
             if isinstance(parsed, dict):
-                return parsed
+                return parsed, validation_errors
         except json.JSONDecodeError as e:
-            error_msg = f"…
-            logger.warning(error_msg)
-            validation_errors.append(
+            error_msg = f"Invalid JSON in patch document: {e}"
+            logger.warning(f"Row {row_idx}: {error_msg}")
+            validation_errors.append(
+                InvalidPatchDocumentFormat(row_idx, error_msg)
+            )
             return {}, validation_errors  # if not validate else None
         else:
-            error_msg = (
-                …
-                …
+            error_msg = f"JSON must be an object/dictionary, got {type(parsed).__name__}"
+            logger.warning(f"Row {row_idx}: {error_msg}")
+            validation_errors.append(
+                InvalidPatchDocumentFormat(row_idx, error_msg)
             )
-            logger.warning(error_msg)
-            validation_errors.append(error_msg)
             return {}, validation_errors  # if not validate else None
 
     # For other types, log warning
-    error_msg = f"…
-    logger.warning(error_msg)
-    validation_errors.append(error_msg)
+    error_msg = f"Unsupported patch type: {type(patch).__name__}"
+    logger.warning(f"Row {row_idx}: {error_msg}")
+    validation_errors.append(InvalidPatchDocumentFormat(row_idx, error_msg))
     return {}, validation_errors  # if not validate else None

@@ -1199,6 +1256,16 @@ def _format_note_for_storage(
     note_text: str,
     current_time_ms: int,
 ) -> list[str] | None:
+    """Format a note text into a JSON-serialized list for storage.
+
+    Args:
+        note_text: The note text content to format.
+        current_time_ms: The current timestamp in milliseconds.
+
+    Returns:
+        list[str] | None: A list containing a single JSON string with note metadata
+            (text, updated_by, updated_at), or None if note_text is NaN/missing.
+    """
     if pd.isna(note_text):
         return None
     note_obj = {

@@ -1215,6 +1282,19 @@ def _log_flight_update_summary(
     request_type: FlightRequestType,
     response: FlightPostArrowFileResponse,
 ) -> None:
+    """Log a structured summary of Flight update results including metrics and errors.
+
+    Args:
+        project_name: Name of the project being updated.
+        total_spans: Total number of spans in the update request.
+        request_type: The type of Flight request being performed.
+        response: The Flight response object containing update results and errors.
+
+    Notes:
+        Logs one summary line with aggregated metrics, plus individual error lines
+        for any failed span updates. Metrics include success rate, spans processed,
+        and failure counts.
+    """
     spans_updated = getattr(response, "spans_updated", None)
     if spans_updated is None:
         # Fallback for older response types

@@ -1278,6 +1358,18 @@ def _message_to_dict(
     preserve_names: bool = True,
     use_int_enums: bool = False,
 ) -> dict[str, object]:
+    """Convert a protobuf Message to a dictionary representation.
+
+    Args:
+        msg: The protobuf Message to convert.
+        preserve_names: If True, preserve original proto field names. If False, use
+            lowerCamelCase names. Defaults to True.
+        use_int_enums: If True, represent enum values as integers. If False, use
+            enum string names. Defaults to False.
+
+    Returns:
+        dict[str, object]: Dictionary representation of the protobuf message.
+    """
     return json_format.MessageToDict(
         msg,
         preserving_proto_field_name=preserve_names,
arize/spans/conversion.py
CHANGED
@@ -120,17 +120,19 @@ def is_missing_value(value: object) -> bool:
         np.inf,
         -np.inf,
     )
-    return value in assumed_missing_values or pd.isna(value)
+    return value in assumed_missing_values or pd.isna(value)  # type: ignore[call-overload]
 
 
 def _jsonify_list_of_dicts(
     list_of_dicts: Iterable[dict[str, object]] | None,
 ) -> list[str]:
-    if …
-        list_of_dicts…
-    ):
+    if list_of_dicts is None or is_missing_value(list_of_dicts):
         return []
-    return […
+    return [
+        result
+        for d in list_of_dicts
+        if (result := _jsonify_dict(d)) is not None
+    ]
 
 
 def _jsonify_dict(d: dict[str, object] | None) -> str | None:
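
Note: the rewritten `_jsonify_list_of_dicts` body uses an assignment expression so each element is serialized exactly once, with `None` results dropped in the same pass. A minimal standalone illustration of the pattern (`jsonify_dict` here is a hypothetical stand-in for the private helper):

```python
import json

def jsonify_dict(d: dict[str, object] | None) -> str | None:
    # Hypothetical stand-in: serialize non-empty dicts, skip the rest.
    return json.dumps(d) if d else None

dicts = [{"a": 1}, None, {}, {"b": 2}]

# The walrus operator binds the serialized value inside the filter clause,
# so the helper runs once per element instead of once to test and once to keep.
out = [result for d in dicts if (result := jsonify_dict(d)) is not None]
print(out)  # ['{"a": 1}', '{"b": 2}']
```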
arize/spans/validation/annotations/value_validation.py
CHANGED

@@ -65,7 +65,7 @@ def check_invalid_annotation_column_names(
     df: pd.DataFrame,
 ) -> list[ValidationError]:
     """Checks for columns that start with 'annotation.' but don't match the expected pattern."""
-    errors = []
+    errors: list[ValidationError] = []
 
     invalid_annotation_columns = [
         col

@@ -78,7 +78,7 @@ def check_annotation_updated_at_timestamp(
     df: pd.DataFrame,
     col_name: str,
     is_required: bool,
-) -> list[…
+) -> list[ValidationError]:
     """Validates annotation timestamp values for validity and acceptable ranges.
 
     Checks that timestamp values are positive, not in the future, and satisfy

@@ -96,7 +96,7 @@ def check_annotation_updated_at_timestamp(
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(

@@ -131,7 +131,7 @@ def check_annotation_cols(
     dataframe: pd.DataFrame,
 ) -> list[ValidationError]:
     """Checks value length and validity for columns matching annotation patterns."""
-    checks = []
+    checks: list[list[ValidationError]] = []
     for col in dataframe.columns:
         if col.endswith(ANNOTATION_LABEL_SUFFIX):
             checks.append(

@@ -140,7 +140,8 @@ def check_annotation_cols(
                     col_name=col,
                     min_len=ANNOTATION_LABEL_MIN_STR_LENGTH,
                     max_len=ANNOTATION_LABEL_MAX_STR_LENGTH,
-                    …
+                    # Individual columns are not required
+                    is_required=False,
                 )
             )
         elif col.endswith(ANNOTATION_SCORE_SUFFIX):

@@ -231,15 +232,11 @@ def check_annotation_notes_column(
     col_name = ANNOTATION_NOTES_COLUMN_NAME
     if col_name in dataframe.columns:
         # Validate the length of the raw string
-        return …
-        …
-        …
-        …
-        …
-        …
-            max_len=ANNOTATION_NOTES_MAX_STR_LENGTH,
-            is_required=False,
-        )
-        )
+        return common_value_validation.check_string_column_value_length(
+            df=dataframe,
+            col_name=col_name,
+            min_len=0,  # Allow empty notes
+            max_len=ANNOTATION_NOTES_MAX_STR_LENGTH,
+            is_required=False,
         )
     return []
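
Note: the recurring `errors: list[ValidationError] = []` annotations are what make these checks type-clean. Without the annotation, mypy infers the element type from the first `append`, after which appending a sibling error class is rejected, and returning the narrower list where `list[ValidationError]` is declared also fails because `list` is invariant. A minimal reproduction with assumed class names:

```python
class ValidationError(Exception): ...
class InvalidMissingValueInColumn(ValidationError): ...
class InvalidTimestampValueInColumn(ValidationError): ...

def check(has_timestamp_issue: bool) -> list[ValidationError]:
    # Without the explicit annotation, mypy infers list[InvalidMissingValueInColumn]
    # from the first append and then rejects both the second append and the return.
    errors: list[ValidationError] = []
    errors.append(InvalidMissingValueInColumn("col"))
    if has_timestamp_issue:
        errors.append(InvalidTimestampValueInColumn("col"))
    return errors
```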
arize/spans/validation/…/dataframe_form_validation.py
CHANGED

@@ -67,5 +67,5 @@ def check_dataframe_for_duplicate_columns(
     # Get the duplicated column names from the dataframe
     duplicate_columns = df.columns[df.columns.duplicated()]
     if not duplicate_columns.empty:
-        return [InvalidDataFrameDuplicateColumns(duplicate_columns)]
+        return [InvalidDataFrameDuplicateColumns(duplicate_columns.tolist())]
     return []
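
Note: a small type fix. `df.columns[df.columns.duplicated()]` yields a pandas `Index`, and `.tolist()` hands the error class a plain Python list instead. Illustration:

```python
import pandas as pd

df = pd.DataFrame([[1, 2, 3]], columns=["a", "b", "a"])
dupes = df.columns[df.columns.duplicated()]
print(type(dupes).__name__)  # Index
print(dupes.tolist())        # ['a'] -- the plain list the error class now receives
```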
arize/spans/validation/common/value_validation.py
CHANGED

@@ -11,6 +11,7 @@ from arize.constants.ml import (
     MAX_FUTURE_YEARS_FROM_CURRENT_TIME,
     MAX_PAST_YEARS_FROM_CURRENT_TIME,
 )
+from arize.exceptions.base import ValidationError
 from arize.exceptions.parameters import InvalidModelVersion, InvalidProjectName
 from arize.spans.columns import (
     SPAN_END_TIME_COL,

@@ -73,7 +74,7 @@ def check_string_column_value_length(
     max_len: int,
     is_required: bool,
     must_be_json: bool = False,
-) -> list[…
+) -> list[ValidationError]:
     """Validate string column values are within length bounds and optionally valid JSON.
 
     Args:

@@ -90,7 +91,7 @@ def check_string_column_value_length(
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(

@@ -129,7 +130,7 @@ def check_string_column_allowed_values(
     col_name: str,
     allowed_values: list[str],
     is_required: bool,
-) -> list[…
+) -> list[ValidationError]:
     """Validate that string column values are within allowed values.
 
     Args:

@@ -144,7 +145,7 @@ def check_string_column_allowed_values(
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(

@@ -177,7 +178,7 @@ def check_string_column_allowed_values(
 def check_float_column_valid_numbers(
     df: pd.DataFrame,
     col_name: str,
-) -> list[…
+) -> list[ValidationError]:
     """Check that float column contains only finite numbers, no infinity values.
 
     Args:

@@ -201,11 +202,7 @@ def check_float_column_valid_numbers(
 
 def check_value_columns_start_end_time(
     df: pd.DataFrame,
-) -> list[
-    InvalidMissingValueInColumn
-    | InvalidTimestampValueInColumn
-    | InvalidStartAndEndTimeValuesInColumn
-]:
+) -> list[ValidationError]:
     """Validate start and end time columns for timestamps and logical ordering.
 
     Args:

@@ -214,7 +211,7 @@ def check_value_columns_start_end_time(
     Returns:
         List of validation errors for missing values, invalid timestamps, or start > end.
     """
-    errors = []
+    errors: list[ValidationError] = []
     errors += check_value_timestamp(
         df=df,
         col_name=SPAN_START_TIME_COL.name,

@@ -243,7 +240,7 @@ def check_value_timestamp(
     df: pd.DataFrame,
     col_name: str,
     is_required: bool,
-) -> list[…
+) -> list[ValidationError]:
     """Validate timestamp column values are within reasonable bounds.
 
     Args:

@@ -258,7 +255,7 @@ def check_value_timestamp(
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
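
Note: collapsing the three-member union return type into `list[ValidationError]` keeps the aggregate check composable: `errors += sub_check(...)` works uniformly, and adding a new error class no longer changes any signature. A simplified, runnable stand-in for the pattern (the real checks also validate timestamp bounds and ordering):

```python
import pandas as pd

class ValidationError(Exception): ...
class InvalidMissingValueInColumn(ValidationError): ...

def check_value_timestamp(df: pd.DataFrame, col_name: str) -> list[ValidationError]:
    # Simplified stand-in for the SDK check: flag missing values in one column.
    errors: list[ValidationError] = []
    if df[col_name].isnull().any():
        errors.append(InvalidMissingValueInColumn(col_name))
    return errors

def check_value_columns_start_end_time(df: pd.DataFrame) -> list[ValidationError]:
    # The broadened base-class annotation lets `+=` concatenate results from
    # heterogeneous sub-checks without widening the signature each time.
    errors: list[ValidationError] = []
    errors += check_value_timestamp(df, "start_time")
    errors += check_value_timestamp(df, "end_time")
    return errors

df = pd.DataFrame({"start_time": [1.0, None], "end_time": [2.0, 3.0]})
print(check_value_columns_start_end_time(df))  # [InvalidMissingValueInColumn('start_time')]
```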
arize/spans/validation/metadata/value_validation.py
CHANGED

@@ -34,6 +34,28 @@ class MetadataValueError(ValidationError):
         return f"{self.message} {self.resolution}"
 
 
+class InvalidPatchDocumentFormat(ValidationError):
+    """Raised when patch document format is invalid or cannot be parsed."""
+
+    def __init__(self, row_idx: int, message: str) -> None:
+        """Initialize the exception with patch document format error context.
+
+        Args:
+            row_idx: The row index where the invalid patch was found.
+            message: Detailed error message describing the format issue.
+        """
+        self.row_idx = row_idx
+        self.message = message
+
+    def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
+        return "Invalid_Patch_Document_Format"
+
+    def error_message(self) -> str:
+        """Return the error message for this exception."""
+        return f"Row {self.row_idx}: {self.message}"
+
+
 def calculate_json_depth(obj: object, current_depth: int = 1) -> int:
     """Calculate the maximum nesting depth of a JSON object.
 

@@ -67,7 +89,7 @@ def validate_values(
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []
 
     # Skip validation if span_id column is not present
     if SPAN_SPAN_ID_COL.name not in metadata_dataframe.columns:
arize/utils/arrow.py
CHANGED
@@ -1,4 +1,3 @@
-# type: ignore[pb2]
 """Apache Arrow utilities for data serialization and file operations."""
 
 from __future__ import annotations

@@ -124,6 +123,18 @@ def post_arrow_table(
 def _append_to_pyarrow_metadata(
     pa_schema: pa.Schema, new_metadata: dict[str, Any]
 ) -> object:
+    """Append metadata to a PyArrow schema without overwriting existing keys.
+
+    Args:
+        pa_schema: The PyArrow schema to add metadata to.
+        new_metadata: Dictionary of metadata key-value pairs to append.
+
+    Returns:
+        pa.Schema: A new PyArrow schema with the merged metadata.
+
+    Raises:
+        KeyError: If any keys in new_metadata conflict with existing schema metadata.
+    """
     # Ensure metadata is handled correctly, even if initially None.
     metadata = pa_schema.metadata
     if metadata is None:

@@ -145,6 +156,14 @@ def _append_to_pyarrow_metadata(
 def _write_arrow_file(
     path: str, pa_table: pa.Table, pa_schema: pa.Schema, max_chunksize: int
 ) -> None:
+    """Write a PyArrow table to an Arrow IPC file with specified schema and chunk size.
+
+    Args:
+        path: The file path where the Arrow file will be written.
+        pa_table: The PyArrow table containing the data to write.
+        pa_schema: The PyArrow schema to use for the file.
+        max_chunksize: Maximum number of rows per record batch chunk.
+    """
     with (
         pa.OSFile(path, mode="wb") as sink,
         pa.ipc.RecordBatchStreamWriter(sink, pa_schema) as writer,

@@ -153,6 +172,15 @@ def _write_arrow_file(
 
 
 def _maybe_log_project_url(response: requests.Response) -> None:
+    """Attempt to extract and log the Arize project URL from an HTTP response.
+
+    Args:
+        response: The HTTP response object from an Arize API request.
+
+    Notes:
+        Logs success message with URL if extraction succeeds, or warning if it fails.
+        This function never raises exceptions.
+    """
     try:
         url = get_arize_project_url(response)
         if url:

@@ -176,6 +204,14 @@ def _mktemp_in(directory: str) -> str:
 
 
 def _filesize(path: str) -> int:
+    """Get the size of a file in bytes.
+
+    Args:
+        path: The file path to check.
+
+    Returns:
+        int: The file size in bytes, or -1 if the file cannot be accessed.
+    """
     try:
         return os.path.getsize(path)
     except Exception:
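
Note: the new docstring on `_append_to_pyarrow_metadata` pins down a useful contract: merge, never overwrite, raise `KeyError` on collisions. A sketch of how such a helper can be built on public PyArrow APIs (a re-creation from the docstring, not the SDK's actual body):

```python
import pyarrow as pa

def append_to_pyarrow_metadata(schema: pa.Schema, new_metadata: dict) -> pa.Schema:
    # PyArrow stores schema metadata as bytes -> bytes, or None when unset.
    existing = schema.metadata or {}
    encoded = {str(k).encode(): str(v).encode() for k, v in new_metadata.items()}
    conflicts = existing.keys() & encoded.keys()
    if conflicts:
        raise KeyError(f"Metadata keys already present: {sorted(conflicts)}")
    # with_metadata returns a new schema; the original is left untouched.
    return schema.with_metadata({**existing, **encoded})

schema = pa.schema([pa.field("x", pa.int64())]).with_metadata({"origin": "sdk"})
schema = append_to_pyarrow_metadata(schema, {"version": "8.0.1"})
print(schema.metadata)  # {b'origin': b'sdk', b'version': b'8.0.1'}
```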
arize/utils/online_tasks/dataframe_preprocessor.py
CHANGED

@@ -81,9 +81,12 @@ def extract_nested_data_to_column(
     remainder = ".".join(parts[prefix_len:])
 
     # 3) Apply introspect row-by-row
+    # Type narrowing: prefix_col is guaranteed to be str after the None check above
+    prefix_col_str: str = prefix_col
+
     def apply_introspect_arize_attribute(
-        row: pd.Series,
-        prefix_col: str = …
+        row: pd.Series,  # type: ignore[type-arg]
+        prefix_col: str = prefix_col_str,
         remainder: str = remainder,
     ) -> object:
         val = row[prefix_col]

@@ -94,8 +97,9 @@ def extract_nested_data_to_column(
         else:
             return result if result is not None else np.nan
 
-    result_df[attribute] = result_df.apply(
-        apply_introspect_arize_attribute,
+    result_df[attribute] = result_df.apply(  # type: ignore[call-overload]
+        apply_introspect_arize_attribute,
+        axis=1,
     )
 
     new_cols.append(attribute)
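
Note: binding `prefix_col_str` through a default argument does double duty: defaults are evaluated once at function-definition time, so the row-wise callable does not depend on late-bound closure state, and the parameter carries a concrete `str` type for the checker after the Optional narrowing. A standalone illustration of the idiom (toy data, not the SDK function):

```python
import pandas as pd

df = pd.DataFrame({"payload": [{"a": 1}, {"a": 2}]})

prefix_col: str | None = "payload"
if prefix_col is None:
    raise ValueError("no prefix column resolved")
prefix_col_str: str = prefix_col  # narrow str | None -> str once, up front

def extract_a(
    row: pd.Series,
    prefix_col: str = prefix_col_str,  # bound at definition time, typed as str
) -> object:
    return row[prefix_col].get("a")

print(df.apply(extract_a, axis=1).tolist())  # [1, 2]
```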
arize/utils/proto.py
CHANGED
arize/utils/types.py
CHANGED
@@ -43,7 +43,7 @@ def is_array_of(arr: Sequence[object], tp: T) -> bool:
     return isinstance(arr, np.ndarray) and all(isinstance(x, tp) for x in arr)
 
 
-def is_list_of(lst: …
+def is_list_of(lst: object, tp: T) -> bool:
     """Check if a value is a list with all elements of a specific type.
 
     Args:

@@ -70,10 +70,10 @@ def is_iterable_of(lst: Sequence[object], tp: T) -> bool:
 
 
 def is_dict_of(
-    d: …
-    key_allowed_types: …
-    value_allowed_types: …
-    value_list_allowed_types: …
+    d: object,
+    key_allowed_types: type | tuple[type, ...],
+    value_allowed_types: type | tuple[type, ...] = (),
+    value_list_allowed_types: type | tuple[type, ...] = (),
 ) -> bool:
     """Method to check types are valid for dictionary.
 

@@ -98,7 +98,7 @@ def is_dict_of(
         and all(isinstance(k, key_allowed_types) for k in d)
         and all(
             isinstance(v, value_allowed_types)
-            or any(is_list_of(v, t) for t in value_list_allowed_types)
+            or any(is_list_of(v, t) for t in value_list_allowed_types)  # type: ignore[union-attr]
             for v in d.values()
             if value_allowed_types or value_list_allowed_types
         )
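
Note: widening the first parameters to `object` makes these guards honest: they are called on values of unknown type, and `isinstance` does the narrowing inside. A simplified, runnable re-creation based on the context lines (the real `is_list_of` checks against a TypeVar, and the real module also handles numpy arrays):

```python
def is_list_of(lst: object, tp: type) -> bool:
    # Simplified: accept any object, narrow with isinstance.
    return isinstance(lst, list) and all(isinstance(x, tp) for x in lst)

def is_dict_of(
    d: object,
    key_allowed_types: type | tuple[type, ...],
    value_allowed_types: type | tuple[type, ...] = (),
    value_list_allowed_types: tuple[type, ...] = (),
) -> bool:
    if not isinstance(d, dict):
        return False
    return all(isinstance(k, key_allowed_types) for k in d) and all(
        isinstance(v, value_allowed_types)
        or any(is_list_of(v, t) for t in value_list_allowed_types)
        for v in d.values()
        if value_allowed_types or value_list_allowed_types
    )

print(is_dict_of({"tags": ["a", "b"], "count": 3}, str, int, (str,)))  # True
print(is_dict_of({1: "x"}, str))                                       # False
```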