arize 8.0.0b2__py3-none-any.whl → 8.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. arize/__init__.py +8 -1
  2. arize/_exporter/client.py +18 -17
  3. arize/_exporter/parsers/tracing_data_parser.py +9 -4
  4. arize/_exporter/validation.py +1 -1
  5. arize/_flight/client.py +33 -13
  6. arize/_lazy.py +37 -2
  7. arize/client.py +61 -35
  8. arize/config.py +168 -14
  9. arize/constants/config.py +1 -0
  10. arize/datasets/client.py +32 -19
  11. arize/embeddings/auto_generator.py +14 -7
  12. arize/embeddings/base_generators.py +15 -9
  13. arize/embeddings/cv_generators.py +2 -2
  14. arize/embeddings/nlp_generators.py +8 -8
  15. arize/embeddings/tabular_generators.py +5 -5
  16. arize/exceptions/config.py +22 -0
  17. arize/exceptions/parameters.py +1 -1
  18. arize/exceptions/values.py +8 -5
  19. arize/experiments/__init__.py +4 -0
  20. arize/experiments/client.py +17 -11
  21. arize/experiments/evaluators/base.py +6 -3
  22. arize/experiments/evaluators/executors.py +6 -4
  23. arize/experiments/evaluators/rate_limiters.py +3 -1
  24. arize/experiments/evaluators/types.py +7 -5
  25. arize/experiments/evaluators/utils.py +7 -5
  26. arize/experiments/functions.py +111 -48
  27. arize/experiments/tracing.py +4 -1
  28. arize/experiments/types.py +31 -26
  29. arize/logging.py +53 -32
  30. arize/ml/batch_validation/validator.py +82 -70
  31. arize/ml/bounded_executor.py +25 -6
  32. arize/ml/casting.py +45 -27
  33. arize/ml/client.py +35 -28
  34. arize/ml/proto.py +16 -17
  35. arize/ml/stream_validation.py +63 -25
  36. arize/ml/surrogate_explainer/mimic.py +15 -7
  37. arize/ml/types.py +26 -12
  38. arize/pre_releases.py +7 -6
  39. arize/py.typed +0 -0
  40. arize/regions.py +10 -10
  41. arize/spans/client.py +113 -21
  42. arize/spans/conversion.py +7 -5
  43. arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
  44. arize/spans/validation/annotations/value_validation.py +11 -14
  45. arize/spans/validation/common/dataframe_form_validation.py +1 -1
  46. arize/spans/validation/common/value_validation.py +10 -13
  47. arize/spans/validation/evals/value_validation.py +1 -1
  48. arize/spans/validation/metadata/argument_validation.py +1 -1
  49. arize/spans/validation/metadata/dataframe_form_validation.py +1 -1
  50. arize/spans/validation/metadata/value_validation.py +23 -1
  51. arize/utils/arrow.py +37 -1
  52. arize/utils/online_tasks/dataframe_preprocessor.py +8 -4
  53. arize/utils/proto.py +0 -1
  54. arize/utils/types.py +6 -6
  55. arize/version.py +1 -1
  56. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/METADATA +18 -3
  57. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/RECORD +60 -58
  58. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/WHEEL +0 -0
  59. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/licenses/LICENSE +0 -0
  60. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/licenses/NOTICE +0 -0
arize/regions.py CHANGED
@@ -1,18 +1,18 @@
 """Region definitions and configuration for Arize deployment zones."""
 
 from dataclasses import dataclass
-from enum import StrEnum
+from enum import Enum
 
 from arize.constants.config import DEFAULT_FLIGHT_PORT
 
 
-class Region(StrEnum):
+class Region(Enum):
     """Enum representing available Arize deployment regions."""
 
-    US_CENTRAL_1 = "us-central-1a"
-    EU_WEST_1 = "eu-west-1a"
-    CA_CENTRAL_1 = "ca-central-1a"
-    US_EAST_1 = "us-east-1b"
+    CA_CENTRAL_1A = "ca-central-1a"
+    EU_WEST_1A = "eu-west-1a"
+    US_CENTRAL_1A = "us-central-1a"
+    US_EAST_1B = "us-east-1b"
 
     UNSET = ""
 
@@ -28,13 +28,13 @@ class RegionEndpoints:
 
 def _get_region_endpoints(region: Region) -> RegionEndpoints:
     return RegionEndpoints(
-        api_host=f"api.{region}.arize.com",
-        otlp_host=f"otlp.{region}.arize.com",
-        flight_host=f"flight.{region}.arize.com",
+        api_host=f"api.{region.value}.arize.com",
+        otlp_host=f"otlp.{region.value}.arize.com",
+        flight_host=f"flight.{region.value}.arize.com",
         flight_port=DEFAULT_FLIGHT_PORT,
     )
 
 
 REGION_ENDPOINTS: dict[Region, RegionEndpoints] = {
-    r: _get_region_endpoints(r) for r in Region if r != Region.UNSET
+    r: _get_region_endpoints(r) for r in list(Region) if r != Region.UNSET
 }
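Note on the Region change above: a plain Enum member does not interpolate as its string value the way a StrEnum member does, which is why the endpoint builders now read region.value. A minimal standalone sketch (only the one member is reproduced here, not the full SDK):

from enum import Enum


class Region(Enum):
    US_EAST_1B = "us-east-1b"


# Plain Enum members render as "Region.US_EAST_1B" inside f-strings,
# so hostnames must be built from the member's .value.
assert f"api.{Region.US_EAST_1B}.arize.com" == "api.Region.US_EAST_1B.arize.com"
assert f"api.{Region.US_EAST_1B.value}.arize.com" == "api.us-east-1b.arize.com"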
arize/spans/client.py CHANGED
@@ -1,4 +1,3 @@
-# type: ignore[pb2]
 """Client implementation for managing spans and traces in the Arize platform."""
 
 from __future__ import annotations
@@ -21,12 +20,16 @@ from arize._flight.types import FlightRequestType
 from arize.constants.spans import DEFAULT_DATETIME_FMT
 from arize.exceptions.base import (
     INVALID_ARROW_CONVERSION_MSG,
+    ValidationError,
     ValidationFailure,
 )
 from arize.exceptions.models import MissingProjectNameError
 from arize.exceptions.spaces import MissingSpaceIDError
 from arize.logging import CtxAdapter
 from arize.ml.types import Environments
+from arize.spans.validation.metadata.value_validation import (
+    InvalidPatchDocumentFormat,
+)
 from arize.utils.arrow import post_arrow_table
 from arize.utils.dataframe import (
     remove_extraneous_columns,
@@ -850,7 +853,8 @@ class SpansClient:
                 )
                 for idx in range(len(metadata_df))
             ]
-            metadata_df[final_patch_column] = merged_patches
+            # Type ignore: pandas DataFrame column assignment type is overly restrictive
+            metadata_df[final_patch_column] = merged_patches  # type: ignore[assignment]
         else:
             # Just use the field patches directly
             metadata_df[final_patch_column] = field_patches
@@ -887,7 +891,8 @@ class SpansClient:
             log.error(e)
             raise ValidationFailure(validation_errors)
 
-        metadata_df[final_patch_column] = processed_patches
+        # Type ignore: pandas DataFrame column assignment type is overly restrictive
+        metadata_df[final_patch_column] = processed_patches  # type: ignore[assignment]
 
         # Run validations on the processed dataframe
         if validate:
@@ -1054,8 +1059,26 @@ class SpansClient:
 
         Retrieves trace/span data from the specified project within a time range
         and writes it directly to a Parquet file at the specified path. Supports
-        filtering with SQL-like WHERE clauses and similarity search for semantic
-        retrieval. Efficient for large datasets and long-term storage.
+        filtering with SQL-like WHERE clauses for efficient querying. Ideal for
+        large datasets and long-term storage.
+
+        Args:
+            path: The file path where the Parquet file will be written.
+            space_id: The space ID where the project resides.
+            project_name: The name of the project to export span data from.
+            start_time: Start of the time range (inclusive) as a datetime object.
+            end_time: End of the time range (inclusive) as a datetime object.
+            where: Optional SQL-like WHERE clause to filter rows (e.g., "span.status_code = 'ERROR'").
+            columns: Optional list of column names to include. If None, all columns are returned.
+            stream_chunk_size: Optional chunk size for streaming large result sets.
+
+        Raises:
+            RuntimeError: If the Flight client request fails or returns no response.
+
+        Notes:
+            - Uses Apache Arrow Flight for efficient data transfer
+            - Data is written directly to the specified path as a Parquet file
+            - Large exports may benefit from specifying stream_chunk_size
         """
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
@@ -1068,7 +1091,7 @@
             exporter = ArizeExportClient(
                 flight_client=flight_client,
             )
-            return exporter.export_to_parquet(
+            exporter.export_to_parquet(
                 path=path,
                 space_id=space_id,
                 model_id=project_name,
@@ -1082,6 +1105,15 @@
 
 
 def _build_patch_document(row: pd.Series) -> dict[str, object]:
+    """Build a patch document from a pandas Series row by extracting metadata fields.
+
+    Args:
+        row: A pandas Series representing a row of data with potential metadata columns.
+
+    Returns:
+        dict[str, object]: A dictionary mapping metadata field names (without the
+            'attributes.metadata.' prefix) to their values, preserving arrays and scalars.
+    """
     # Extract and preserve metadata values with proper types
     patch = {}
     for key in row.index:
@@ -1103,9 +1135,21 @@ def _build_patch_document(row: pd.Series) -> dict[str, object]:
 def _process_patch_document(
     metadata_df: pd.DataFrame,
     patch_document_column_name: str,
-    field_patches: pd.DataFrame,
+    field_patches: pd.Series[Any],
     row_idx: int,
 ) -> dict[str, object]:
+    """Process and merge patch documents from field patches and explicit patch column.
+
+    Args:
+        metadata_df: DataFrame containing the metadata with patch documents.
+        patch_document_column_name: Name of the column containing explicit patch documents.
+        field_patches: DataFrame containing patches derived from individual metadata fields.
+        row_idx: The row index to process.
+
+    Returns:
+        dict[str, object]: Merged patch document where explicit patches take precedence over
+            field patches. Returns empty dict if patch document is invalid or missing.
+    """
     # Get the field patch for this row
     field_patch = field_patches.iloc[row_idx]
 
@@ -1152,9 +1196,21 @@ def _ensure_dict_patch(
     metadata_df: pd.DataFrame,
     final_patch_column: str,
     row_idx: int,
-) -> tuple[dict[str, object], list[str]]:
+) -> tuple[dict[str, object], list[ValidationError]]:
+    """Ensure a patch value is a dictionary, converting from JSON string if needed.
+
+    Args:
+        metadata_df: DataFrame containing the patch data.
+        final_patch_column: Name of the column containing the final patch document.
+        row_idx: The row index to process.
+
+    Returns:
+        tuple[dict[str, object], list[ValidationError]]: A tuple containing:
+            - The patch as a dictionary (empty dict if invalid or missing)
+            - List of validation errors (empty if no errors)
+    """
     patch = metadata_df.loc[row_idx, final_patch_column]
-    validation_errors = []
+    validation_errors: list[ValidationError] = []
 
     # For None/null values, return an empty dict
     if patch is None:
@@ -1173,25 +1229,26 @@
         try:
             parsed = json.loads(patch)
             if isinstance(parsed, dict):
-                return parsed
+                return parsed, validation_errors
         except json.JSONDecodeError as e:
-            error_msg = f"Row {row_idx}: Invalid JSON in patch document: {e}"
-            logger.warning(error_msg)
-            validation_errors.append(error_msg)
+            error_msg = f"Invalid JSON in patch document: {e}"
+            logger.warning(f"Row {row_idx}: {error_msg}")
+            validation_errors.append(
+                InvalidPatchDocumentFormat(row_idx, error_msg)
+            )
             return {}, validation_errors  # if not validate else None
         else:
-            error_msg = (
-                f"Row {row_idx}: JSON must be an object/dictionary, "
-                f"got {type(parsed).__name__}"
+            error_msg = f"JSON must be an object/dictionary, got {type(parsed).__name__}"
+            logger.warning(f"Row {row_idx}: {error_msg}")
+            validation_errors.append(
+                InvalidPatchDocumentFormat(row_idx, error_msg)
             )
-            logger.warning(error_msg)
-            validation_errors.append(error_msg)
             return {}, validation_errors  # if not validate else None
 
     # For other types, log warning
-    error_msg = f"Row {row_idx}: Unsupported patch type: {type(patch).__name__}"
-    logger.warning(error_msg)
-    validation_errors.append(error_msg)
+    error_msg = f"Unsupported patch type: {type(patch).__name__}"
+    logger.warning(f"Row {row_idx}: {error_msg}")
+    validation_errors.append(InvalidPatchDocumentFormat(row_idx, error_msg))
     return {}, validation_errors  # if not validate else None
 
 
@@ -1199,6 +1256,16 @@ def _format_note_for_storage(
     note_text: str,
     current_time_ms: int,
 ) -> list[str] | None:
+    """Format a note text into a JSON-serialized list for storage.
+
+    Args:
+        note_text: The note text content to format.
+        current_time_ms: The current timestamp in milliseconds.
+
+    Returns:
+        list[str] | None: A list containing a single JSON string with note metadata
+            (text, updated_by, updated_at), or None if note_text is NaN/missing.
+    """
     if pd.isna(note_text):
         return None
     note_obj = {
@@ -1215,6 +1282,19 @@ def _log_flight_update_summary(
     request_type: FlightRequestType,
     response: FlightPostArrowFileResponse,
 ) -> None:
+    """Log a structured summary of Flight update results including metrics and errors.
+
+    Args:
+        project_name: Name of the project being updated.
+        total_spans: Total number of spans in the update request.
+        request_type: The type of Flight request being performed.
+        response: The Flight response object containing update results and errors.
+
+    Notes:
+        Logs one summary line with aggregated metrics, plus individual error lines
+        for any failed span updates. Metrics include success rate, spans processed,
+        and failure counts.
+    """
     spans_updated = getattr(response, "spans_updated", None)
     if spans_updated is None:
         # Fallback for older response types
@@ -1278,6 +1358,18 @@ def _message_to_dict(
     preserve_names: bool = True,
     use_int_enums: bool = False,
 ) -> dict[str, object]:
+    """Convert a protobuf Message to a dictionary representation.
+
+    Args:
+        msg: The protobuf Message to convert.
+        preserve_names: If True, preserve original proto field names. If False, use
+            lowerCamelCase names. Defaults to True.
+        use_int_enums: If True, represent enum values as integers. If False, use
+            enum string names. Defaults to False.
+
+    Returns:
+        dict[str, object]: Dictionary representation of the protobuf message.
+    """
     return json_format.MessageToDict(
         msg,
         preserving_proto_field_name=preserve_names,
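For orientation, a hypothetical call shaped like the docstring added above. The method name export_to_parquet on the spans client and the pre-built spans_client object are assumptions for illustration; only the parameter names come from this diff:

from datetime import datetime, timezone

# spans_client stands in for an already configured SpansClient instance.
spans_client.export_to_parquet(
    path="error_spans.parquet",
    space_id="SPACE_ID",
    project_name="my-llm-project",
    start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),
    end_time=datetime(2024, 1, 2, tzinfo=timezone.utc),
    where="span.status_code = 'ERROR'",  # optional SQL-like filter per the docstring
)
# Per the second hunk above, the wrapper now writes the Parquet file and
# returns None rather than returning the exporter's result.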
arize/spans/conversion.py CHANGED
@@ -120,17 +120,19 @@ def is_missing_value(value: object) -> bool:
         np.inf,
         -np.inf,
     )
-    return value in assumed_missing_values or pd.isna(value)
+    return value in assumed_missing_values or pd.isna(value)  # type: ignore[call-overload]
 
 
 def _jsonify_list_of_dicts(
     list_of_dicts: Iterable[dict[str, object]] | None,
 ) -> list[str]:
-    if not isinstance(list_of_dicts, Iterable) and is_missing_value(
-        list_of_dicts
-    ):
+    if list_of_dicts is None or is_missing_value(list_of_dicts):
         return []
-    return [_jsonify_dict(d) for d in list_of_dicts]
+    return [
+        result
+        for d in list_of_dicts
+        if (result := _jsonify_dict(d)) is not None
+    ]
 
 
 def _jsonify_dict(d: dict[str, object] | None) -> str | None:
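The rewritten comprehension in _jsonify_list_of_dicts drops entries whose conversion comes back as None rather than emitting them. A standalone sketch of the same walrus-filter pattern, with a local stand-in instead of the module's private _jsonify_dict:

import json


def jsonify_dict(d: dict[str, object] | None) -> str | None:
    # Stand-in for _jsonify_dict: None stays None, dicts become JSON strings.
    return None if d is None else json.dumps(d)


dicts = [{"a": 1}, None, {"b": 2}]
# The walrus expression binds each converted value, and the filter keeps
# only the non-None results, mirroring the new list comprehension above.
as_json = [result for d in dicts if (result := jsonify_dict(d)) is not None]
assert as_json == ['{"a": 1}', '{"b": 2}']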
arize/spans/validation/annotations/dataframe_form_validation.py CHANGED
@@ -65,7 +65,7 @@ def check_invalid_annotation_column_names(
     df: pd.DataFrame,
 ) -> list[ValidationError]:
     """Checks for columns that start with 'annotation.' but don't match the expected pattern."""
-    errors = []
+    errors: list[ValidationError] = []
 
     invalid_annotation_columns = [
         col
arize/spans/validation/annotations/value_validation.py CHANGED
@@ -78,7 +78,7 @@ def check_annotation_updated_at_timestamp(
     df: pd.DataFrame,
     col_name: str,
     is_required: bool,
-) -> list[InvalidMissingValueInColumn | InvalidAnnotationTimestamp]:
+) -> list[ValidationError]:
     """Validates annotation timestamp values for validity and acceptable ranges.
 
     Checks that timestamp values are positive, not in the future, and satisfy
@@ -96,7 +96,7 @@ def check_annotation_updated_at_timestamp(
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
@@ -131,7 +131,7 @@ def check_annotation_cols(
     dataframe: pd.DataFrame,
 ) -> list[ValidationError]:
     """Checks value length and validity for columns matching annotation patterns."""
-    checks = []
+    checks: list[list[ValidationError]] = []
     for col in dataframe.columns:
         if col.endswith(ANNOTATION_LABEL_SUFFIX):
             checks.append(
@@ -140,7 +140,8 @@
                     col_name=col,
                     min_len=ANNOTATION_LABEL_MIN_STR_LENGTH,
                     max_len=ANNOTATION_LABEL_MAX_STR_LENGTH,
-                    is_required=False,  # Individual columns are not required, null check handles completeness
+                    # Individual columns are not required
+                    is_required=False,
                 )
             )
         elif col.endswith(ANNOTATION_SCORE_SUFFIX):
@@ -231,15 +232,11 @@
     col_name = ANNOTATION_NOTES_COLUMN_NAME
     if col_name in dataframe.columns:
         # Validate the length of the raw string
-        return list(
-            chain(
-                *common_value_validation.check_string_column_value_length(
-                    df=dataframe,
-                    col_name=col_name,
-                    min_len=0,  # Allow empty notes
-                    max_len=ANNOTATION_NOTES_MAX_STR_LENGTH,
-                    is_required=False,
-                )
-            )
+        return common_value_validation.check_string_column_value_length(
+            df=dataframe,
+            col_name=col_name,
+            min_len=0,  # Allow empty notes
+            max_len=ANNOTATION_NOTES_MAX_STR_LENGTH,
+            is_required=False,
         )
     return []
arize/spans/validation/common/dataframe_form_validation.py CHANGED
@@ -67,5 +67,5 @@ def check_dataframe_for_duplicate_columns(
     # Get the duplicated column names from the dataframe
     duplicate_columns = df.columns[df.columns.duplicated()]
     if not duplicate_columns.empty:
-        return [InvalidDataFrameDuplicateColumns(duplicate_columns)]
+        return [InvalidDataFrameDuplicateColumns(duplicate_columns.tolist())]
     return []
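Background for the .tolist() call above: df.columns[df.columns.duplicated()] returns a pandas Index, and converting it to a plain list keeps the error object free of pandas-specific types. A small illustration with a made-up frame:

import pandas as pd

# A frame with a duplicated "a" column; duplicates must be passed via `columns`.
df = pd.DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
duplicate_columns = df.columns[df.columns.duplicated()]

print(type(duplicate_columns).__name__)  # Index
print(duplicate_columns.tolist())        # ['a']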
arize/spans/validation/common/value_validation.py CHANGED
@@ -11,6 +11,7 @@ from arize.constants.ml import (
     MAX_FUTURE_YEARS_FROM_CURRENT_TIME,
     MAX_PAST_YEARS_FROM_CURRENT_TIME,
 )
+from arize.exceptions.base import ValidationError
 from arize.exceptions.parameters import InvalidModelVersion, InvalidProjectName
 from arize.spans.columns import (
     SPAN_END_TIME_COL,
@@ -73,7 +74,7 @@
     max_len: int,
     is_required: bool,
     must_be_json: bool = False,
-) -> list[InvalidMissingValueInColumn | InvalidStringLengthInColumn]:
+) -> list[ValidationError]:
     """Validate string column values are within length bounds and optionally valid JSON.
 
     Args:
@@ -90,7 +91,7 @@
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
@@ -129,7 +130,7 @@ def check_string_column_allowed_values(
     col_name: str,
     allowed_values: list[str],
     is_required: bool,
-) -> list[InvalidMissingValueInColumn | InvalidStringValueNotAllowedInColumn]:
+) -> list[ValidationError]:
     """Validate that string column values are within allowed values.
 
     Args:
@@ -144,7 +145,7 @@
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
@@ -177,7 +178,7 @@ def check_string_column_allowed_values(
 def check_float_column_valid_numbers(
     df: pd.DataFrame,
     col_name: str,
-) -> list[InvalidFloatValueInColumn]:
+) -> list[ValidationError]:
     """Check that float column contains only finite numbers, no infinity values.
 
     Args:
@@ -201,11 +202,7 @@
 
 def check_value_columns_start_end_time(
     df: pd.DataFrame,
-) -> list[
-    InvalidMissingValueInColumn
-    | InvalidTimestampValueInColumn
-    | InvalidStartAndEndTimeValuesInColumn
-]:
+) -> list[ValidationError]:
     """Validate start and end time columns for timestamps and logical ordering.
 
     Args:
@@ -214,7 +211,7 @@
     Returns:
         List of validation errors for missing values, invalid timestamps, or start > end.
     """
-    errors = []
+    errors: list[ValidationError] = []
     errors += check_value_timestamp(
         df=df,
         col_name=SPAN_START_TIME_COL.name,
@@ -243,7 +240,7 @@ def check_value_timestamp(
     df: pd.DataFrame,
     col_name: str,
     is_required: bool,
-) -> list[InvalidMissingValueInColumn | InvalidTimestampValueInColumn]:
+) -> list[ValidationError]:
     """Validate timestamp column values are within reasonable bounds.
 
     Args:
@@ -258,7 +255,7 @@
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
arize/spans/validation/evals/value_validation.py CHANGED
@@ -40,7 +40,7 @@ def check_eval_cols(
     Returns:
         List of validation errors found in evaluation columns.
     """
-    checks = []
+    checks: list[list[ValidationError]] = []
     for col in dataframe.columns:
         if col.endswith(EVAL_LABEL_SUFFIX):
             checks.append(
arize/spans/validation/metadata/argument_validation.py CHANGED
@@ -39,7 +39,7 @@ def validate_argument_types(
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []
 
     # Check metadata_dataframe type
     if not isinstance(metadata_dataframe, pd.DataFrame):
arize/spans/validation/metadata/dataframe_form_validation.py CHANGED
@@ -41,7 +41,7 @@ def validate_dataframe_form(
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []
 
     # Check for empty dataframe
     if metadata_dataframe.empty:
arize/spans/validation/metadata/value_validation.py CHANGED
@@ -34,6 +34,28 @@ class MetadataValueError(ValidationError):
         return f"{self.message} {self.resolution}"
 
 
+class InvalidPatchDocumentFormat(ValidationError):
+    """Raised when patch document format is invalid or cannot be parsed."""
+
+    def __init__(self, row_idx: int, message: str) -> None:
+        """Initialize the exception with patch document format error context.
+
+        Args:
+            row_idx: The row index where the invalid patch was found.
+            message: Detailed error message describing the format issue.
+        """
+        self.row_idx = row_idx
+        self.message = message
+
+    def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
+        return "Invalid_Patch_Document_Format"
+
+    def error_message(self) -> str:
+        """Return the error message for this exception."""
+        return f"Row {self.row_idx}: {self.message}"
+
+
 def calculate_json_depth(obj: object, current_depth: int = 1) -> int:
     """Calculate the maximum nesting depth of a JSON object.
 
@@ -67,7 +89,7 @@
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []
 
     # Skip validation if span_id column is not present
     if SPAN_SPAN_ID_COL.name not in metadata_dataframe.columns:
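A quick sketch of how the new error type reports itself, using only the class as defined above and assuming the ValidationError base adds no extra constructor requirements:

from arize.spans.validation.metadata.value_validation import InvalidPatchDocumentFormat

err = InvalidPatchDocumentFormat(
    row_idx=3,
    message="Invalid JSON in patch document: Expecting value: line 1 column 1 (char 0)",
)
print(repr(err))            # Invalid_Patch_Document_Format
print(err.error_message())  # Row 3: Invalid JSON in patch document: ...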
arize/utils/arrow.py CHANGED
@@ -1,4 +1,3 @@
-# type: ignore[pb2]
 """Apache Arrow utilities for data serialization and file operations."""
 
 from __future__ import annotations
@@ -124,6 +123,18 @@
 def _append_to_pyarrow_metadata(
     pa_schema: pa.Schema, new_metadata: dict[str, Any]
 ) -> object:
+    """Append metadata to a PyArrow schema without overwriting existing keys.
+
+    Args:
+        pa_schema: The PyArrow schema to add metadata to.
+        new_metadata: Dictionary of metadata key-value pairs to append.
+
+    Returns:
+        pa.Schema: A new PyArrow schema with the merged metadata.
+
+    Raises:
+        KeyError: If any keys in new_metadata conflict with existing schema metadata.
+    """
     # Ensure metadata is handled correctly, even if initially None.
     metadata = pa_schema.metadata
     if metadata is None:
@@ -145,6 +156,14 @@
 def _write_arrow_file(
     path: str, pa_table: pa.Table, pa_schema: pa.Schema, max_chunksize: int
 ) -> None:
+    """Write a PyArrow table to an Arrow IPC file with specified schema and chunk size.
+
+    Args:
+        path: The file path where the Arrow file will be written.
+        pa_table: The PyArrow table containing the data to write.
+        pa_schema: The PyArrow schema to use for the file.
+        max_chunksize: Maximum number of rows per record batch chunk.
+    """
     with (
         pa.OSFile(path, mode="wb") as sink,
         pa.ipc.RecordBatchStreamWriter(sink, pa_schema) as writer,
@@ -153,6 +172,15 @@
 
 
 def _maybe_log_project_url(response: requests.Response) -> None:
+    """Attempt to extract and log the Arize project URL from an HTTP response.
+
+    Args:
+        response: The HTTP response object from an Arize API request.
+
+    Notes:
+        Logs success message with URL if extraction succeeds, or warning if it fails.
+        This function never raises exceptions.
+    """
     try:
         url = get_arize_project_url(response)
         if url:
@@ -176,6 +204,14 @@ def _mktemp_in(directory: str) -> str:
 
 
 def _filesize(path: str) -> int:
+    """Get the size of a file in bytes.
+
+    Args:
+        path: The file path to check.
+
+    Returns:
+        int: The file size in bytes, or -1 if the file cannot be accessed.
+    """
     try:
         return os.path.getsize(path)
     except Exception:
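The docstring added to _append_to_pyarrow_metadata describes merge-without-overwrite semantics for schema metadata. A rough standalone sketch of that contract in plain PyArrow; the helper below is illustrative and is not the module's implementation:

import pyarrow as pa


def append_metadata(schema: pa.Schema, new_metadata: dict) -> pa.Schema:
    existing = dict(schema.metadata or {})
    # Refuse to overwrite keys that are already present, as the docstring specifies.
    conflicts = set(existing) & {str(k).encode() for k in new_metadata}
    if conflicts:
        raise KeyError(f"Metadata keys already present: {conflicts}")
    existing.update(
        {str(k).encode(): str(v).encode() for k, v in new_metadata.items()}
    )
    return schema.with_metadata(existing)


schema = pa.schema([pa.field("span_id", pa.string())]).with_metadata({"source": "arize"})
schema = append_metadata(schema, {"export_format": "parquet"})
print(schema.metadata)  # {b'source': b'arize', b'export_format': b'parquet'}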
arize/utils/online_tasks/dataframe_preprocessor.py CHANGED
@@ -81,9 +81,12 @@ def extract_nested_data_to_column(
         remainder = ".".join(parts[prefix_len:])
 
         # 3) Apply introspect row-by-row
+        # Type narrowing: prefix_col is guaranteed to be str after the None check above
+        prefix_col_str: str = prefix_col
+
         def apply_introspect_arize_attribute(
-            row: pd.Series,
-            prefix_col: str = prefix_col,
+            row: pd.Series,  # type: ignore[type-arg]
+            prefix_col: str = prefix_col_str,
             remainder: str = remainder,
         ) -> object:
             val = row[prefix_col]
@@ -94,8 +97,9 @@
             else:
                 return result if result is not None else np.nan
 
-        result_df[attribute] = result_df.apply(
-            apply_introspect_arize_attribute, axis=1
+        result_df[attribute] = result_df.apply(  # type: ignore[call-overload]
+            apply_introspect_arize_attribute,
+            axis=1,
         )
 
         new_cols.append(attribute)
arize/utils/proto.py CHANGED
@@ -1,4 +1,3 @@
-# type: ignore[pb2]
 """Protocol buffer schema utilities for tracing data."""
 
 from arize._generated.protocol.rec import public_pb2 as pb2
arize/utils/types.py CHANGED
@@ -43,7 +43,7 @@ def is_array_of(arr: Sequence[object], tp: T) -> bool:
     return isinstance(arr, np.ndarray) and all(isinstance(x, tp) for x in arr)
 
 
-def is_list_of(lst: Sequence[object], tp: T) -> bool:
+def is_list_of(lst: object, tp: T) -> bool:
     """Check if a value is a list with all elements of a specific type.
 
     Args:
@@ -70,10 +70,10 @@ def is_iterable_of(lst: Sequence[object], tp: T) -> bool:
 
 
 def is_dict_of(
-    d: dict[object, object],
-    key_allowed_types: T,
-    value_allowed_types: T = (),
-    value_list_allowed_types: T = (),
+    d: object,
+    key_allowed_types: type | tuple[type, ...],
+    value_allowed_types: type | tuple[type, ...] = (),
+    value_list_allowed_types: type | tuple[type, ...] = (),
 ) -> bool:
     """Method to check types are valid for dictionary.
 
@@ -98,7 +98,7 @@
         and all(isinstance(k, key_allowed_types) for k in d)
         and all(
             isinstance(v, value_allowed_types)
-            or any(is_list_of(v, t) for t in value_list_allowed_types)
+            or any(is_list_of(v, t) for t in value_list_allowed_types)  # type: ignore[union-attr]
             for v in d.values()
             if value_allowed_types or value_list_allowed_types
        )
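Usage sketch for the widened type-checking helpers; the calls assume is_dict_of and is_list_of as defined in arize/utils/types.py, and the expected True results follow from the checks visible in the hunks above:

from arize.utils.types import is_dict_of, is_list_of

print(is_list_of(["prod", "llm"], str))  # True: a list whose elements are all str
print(
    is_dict_of(
        {"latency_ms": 12.5, "tags": ["prod", "llm"]},
        key_allowed_types=str,
        value_allowed_types=(int, float),
        value_list_allowed_types=(str,),
    )
)  # True: str keys, with numeric values or lists of str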