arize 8.0.0b1__py3-none-any.whl → 8.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. arize/__init__.py +9 -2
  2. arize/_client_factory.py +50 -0
  3. arize/_exporter/client.py +18 -17
  4. arize/_exporter/parsers/tracing_data_parser.py +9 -4
  5. arize/_exporter/validation.py +1 -1
  6. arize/_flight/client.py +37 -17
  7. arize/_generated/api_client/api/datasets_api.py +6 -6
  8. arize/_generated/api_client/api/experiments_api.py +6 -6
  9. arize/_generated/api_client/api/projects_api.py +3 -3
  10. arize/_lazy.py +61 -10
  11. arize/client.py +66 -50
  12. arize/config.py +175 -48
  13. arize/constants/config.py +1 -0
  14. arize/constants/ml.py +9 -16
  15. arize/constants/spans.py +5 -10
  16. arize/datasets/client.py +45 -28
  17. arize/datasets/errors.py +1 -1
  18. arize/datasets/validation.py +2 -2
  19. arize/embeddings/auto_generator.py +16 -9
  20. arize/embeddings/base_generators.py +15 -9
  21. arize/embeddings/cv_generators.py +2 -2
  22. arize/embeddings/errors.py +2 -2
  23. arize/embeddings/nlp_generators.py +8 -8
  24. arize/embeddings/tabular_generators.py +6 -6
  25. arize/exceptions/base.py +0 -52
  26. arize/exceptions/config.py +22 -0
  27. arize/exceptions/parameters.py +1 -330
  28. arize/exceptions/values.py +8 -5
  29. arize/experiments/__init__.py +4 -0
  30. arize/experiments/client.py +31 -18
  31. arize/experiments/evaluators/base.py +12 -9
  32. arize/experiments/evaluators/executors.py +16 -7
  33. arize/experiments/evaluators/rate_limiters.py +3 -1
  34. arize/experiments/evaluators/types.py +9 -7
  35. arize/experiments/evaluators/utils.py +7 -5
  36. arize/experiments/functions.py +128 -58
  37. arize/experiments/tracing.py +4 -1
  38. arize/experiments/types.py +34 -31
  39. arize/logging.py +54 -33
  40. arize/ml/batch_validation/errors.py +10 -1004
  41. arize/ml/batch_validation/validator.py +351 -291
  42. arize/ml/bounded_executor.py +25 -6
  43. arize/ml/casting.py +51 -33
  44. arize/ml/client.py +43 -35
  45. arize/ml/proto.py +21 -22
  46. arize/ml/stream_validation.py +64 -27
  47. arize/ml/surrogate_explainer/mimic.py +18 -10
  48. arize/ml/types.py +27 -67
  49. arize/pre_releases.py +10 -6
  50. arize/projects/client.py +9 -4
  51. arize/py.typed +0 -0
  52. arize/regions.py +11 -11
  53. arize/spans/client.py +125 -31
  54. arize/spans/columns.py +32 -36
  55. arize/spans/conversion.py +12 -11
  56. arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
  57. arize/spans/validation/annotations/value_validation.py +11 -14
  58. arize/spans/validation/common/argument_validation.py +3 -3
  59. arize/spans/validation/common/dataframe_form_validation.py +7 -7
  60. arize/spans/validation/common/value_validation.py +11 -14
  61. arize/spans/validation/evals/dataframe_form_validation.py +4 -4
  62. arize/spans/validation/evals/evals_validation.py +6 -6
  63. arize/spans/validation/evals/value_validation.py +1 -1
  64. arize/spans/validation/metadata/argument_validation.py +1 -1
  65. arize/spans/validation/metadata/dataframe_form_validation.py +2 -2
  66. arize/spans/validation/metadata/value_validation.py +23 -1
  67. arize/spans/validation/spans/dataframe_form_validation.py +2 -2
  68. arize/spans/validation/spans/spans_validation.py +6 -6
  69. arize/utils/arrow.py +38 -2
  70. arize/utils/cache.py +2 -2
  71. arize/utils/dataframe.py +4 -4
  72. arize/utils/online_tasks/dataframe_preprocessor.py +15 -11
  73. arize/utils/openinference_conversion.py +10 -10
  74. arize/utils/proto.py +0 -1
  75. arize/utils/types.py +6 -6
  76. arize/version.py +1 -1
  77. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/METADATA +32 -7
  78. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/RECORD +81 -78
  79. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/WHEEL +0 -0
  80. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/LICENSE +0 -0
  81. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/NOTICE +0 -0
arize/spans/client.py CHANGED
@@ -1,6 +1,5 @@
  """Client implementation for managing spans and traces in the Arize platform."""

- # type: ignore[pb2]
  from __future__ import annotations

  import json
@@ -21,12 +20,16 @@ from arize._flight.types import FlightRequestType
  from arize.constants.spans import DEFAULT_DATETIME_FMT
  from arize.exceptions.base import (
  INVALID_ARROW_CONVERSION_MSG,
+ ValidationError,
  ValidationFailure,
  )
  from arize.exceptions.models import MissingProjectNameError
  from arize.exceptions.spaces import MissingSpaceIDError
  from arize.logging import CtxAdapter
  from arize.ml.types import Environments
+ from arize.spans.validation.metadata.value_validation import (
+ InvalidPatchDocumentFormat,
+ )
  from arize.utils.arrow import post_arrow_table
  from arize.utils.dataframe import (
  remove_extraneous_columns,
@@ -78,10 +81,11 @@ class SpansClient:
  Args:
  space_id: The space ID where the project resides.
  project_name: A unique name to identify your project in the Arize platform.
- dataframe: The dataframe containing the LLM traces.
- evals_dataframe: A dataframe containing LLM evaluations data.
- The evaluations are joined to their corresponding spans via a left outer join, i.e.,
- using only `context.span_id` from the spans dataframe. Defaults to None.
+ dataframe (:class:`pandas.DataFrame`): The dataframe containing the LLM traces.
+ evals_dataframe (:class:`pandas.DataFrame` | :obj:`None`): A dataframe containing
+ LLM evaluations data. The evaluations are joined to their corresponding spans
+ via a left outer join, i.e., using only `context.span_id` from the spans
+ dataframe. Defaults to None.
  datetime_format: format for the timestamp captured in the LLM traces.
  Defaults to "%Y-%m-%dT%H:%M:%S.%f+00:00".
  validate: When set to True, validation is run before sending data.
@@ -280,7 +284,7 @@ class SpansClient:
  Args:
  space_id: The space ID where the project resides.
  project_name: A unique name to identify your project in the Arize platform.
- dataframe: A dataframe containing LLM evaluations data.
+ dataframe (:class:`pandas.DataFrame`): A dataframe containing LLM evaluations data.
  validate: When set to True, validation is run before sending data.
  Defaults to True.
  force_http: Force the use of HTTP for data upload. Defaults to False.
@@ -453,7 +457,7 @@ class SpansClient:
  Args:
  space_id: The space ID where the project resides.
  project_name: A unique name to identify your project in the Arize platform.
- dataframe: A dataframe containing LLM annotation data.
+ dataframe (:class:`pandas.DataFrame`): A dataframe containing LLM annotation data.
  validate: When set to True, validation is run before sending data.
  Defaults to True.
  """
@@ -684,7 +688,8 @@ class SpansClient:
  Args:
  space_id: The space ID where the project resides.
  project_name: A unique name to identify your project in the Arize platform.
- dataframe: DataFrame with span_ids and either patch documents or metadata field columns.
+ dataframe (:class:`pandas.DataFrame`): DataFrame with span_ids and either patch
+ documents or metadata field columns.
  patch_document_column_name: Name of the column containing JSON patch documents.
  Defaults to "patch_document".
  validate: When set to True, validation is run before sending data.
@@ -848,7 +853,8 @@ class SpansClient:
  )
  for idx in range(len(metadata_df))
  ]
- metadata_df[final_patch_column] = merged_patches
+ # Type ignore: pandas DataFrame column assignment type is overly restrictive
+ metadata_df[final_patch_column] = merged_patches # type: ignore[assignment]
  else:
  # Just use the field patches directly
  metadata_df[final_patch_column] = field_patches
@@ -885,7 +891,8 @@ class SpansClient:
  log.error(e)
  raise ValidationFailure(validation_errors)

- metadata_df[final_patch_column] = processed_patches
+ # Type ignore: pandas DataFrame column assignment type is overly restrictive
+ metadata_df[final_patch_column] = processed_patches # type: ignore[assignment]

  # Run validations on the processed dataframe
  if validate:
@@ -1004,14 +1011,14 @@ class SpansClient:
  columns: list | None = None,
  stream_chunk_size: int | None = None,
  ) -> pd.DataFrame:
- """Export span data from Arize to a pandas DataFrame.
+ """Export span data from Arize to a :class:`pandas.DataFrame`.

  Retrieves trace/span data from the specified project within a time range
- and returns it as a pandas DataFrame. Supports filtering with SQL-like
+ and returns it as a :class:`pandas.DataFrame`. Supports filtering with SQL-like
  WHERE clauses and similarity search for semantic retrieval.

  Returns:
- DataFrame containing the requested span data with columns
+ :class:`pandas.DataFrame`: DataFrame containing the requested span data with columns
  for span metadata, attributes, events, and any custom fields.
  """
  with ArizeFlightClient(
@@ -1052,8 +1059,26 @@ class SpansClient:

  Retrieves trace/span data from the specified project within a time range
  and writes it directly to a Parquet file at the specified path. Supports
- filtering with SQL-like WHERE clauses and similarity search for semantic
- retrieval. Efficient for large datasets and long-term storage.
+ filtering with SQL-like WHERE clauses for efficient querying. Ideal for
+ large datasets and long-term storage.
+
+ Args:
+ path: The file path where the Parquet file will be written.
+ space_id: The space ID where the project resides.
+ project_name: The name of the project to export span data from.
+ start_time: Start of the time range (inclusive) as a datetime object.
+ end_time: End of the time range (inclusive) as a datetime object.
+ where: Optional SQL-like WHERE clause to filter rows (e.g., "span.status_code = 'ERROR'").
+ columns: Optional list of column names to include. If None, all columns are returned.
+ stream_chunk_size: Optional chunk size for streaming large result sets.
+
+ Raises:
+ RuntimeError: If the Flight client request fails or returns no response.
+
+ Notes:
+ - Uses Apache Arrow Flight for efficient data transfer
+ - Data is written directly to the specified path as a Parquet file
+ - Large exports may benefit from specifying stream_chunk_size
  """
  with ArizeFlightClient(
  api_key=self._sdk_config.api_key,
@@ -1066,7 +1091,7 @@ class SpansClient:
  exporter = ArizeExportClient(
  flight_client=flight_client,
  )
- return exporter.export_to_parquet(
+ exporter.export_to_parquet(
  path=path,
  space_id=space_id,
  model_id=project_name,
@@ -1080,6 +1105,15 @@


  def _build_patch_document(row: pd.Series) -> dict[str, object]:
+ """Build a patch document from a pandas Series row by extracting metadata fields.
+
+ Args:
+ row: A pandas Series representing a row of data with potential metadata columns.
+
+ Returns:
+ dict[str, object]: A dictionary mapping metadata field names (without the
+ 'attributes.metadata.' prefix) to their values, preserving arrays and scalars.
+ """
  # Extract and preserve metadata values with proper types
  patch = {}
  for key in row.index:
@@ -1101,9 +1135,21 @@ def _build_patch_document(row: pd.Series) -> dict[str, object]:
  def _process_patch_document(
  metadata_df: pd.DataFrame,
  patch_document_column_name: str,
- field_patches: pd.DataFrame,
+ field_patches: pd.Series[Any],
  row_idx: int,
  ) -> dict[str, object]:
+ """Process and merge patch documents from field patches and explicit patch column.
+
+ Args:
+ metadata_df: DataFrame containing the metadata with patch documents.
+ patch_document_column_name: Name of the column containing explicit patch documents.
+ field_patches: DataFrame containing patches derived from individual metadata fields.
+ row_idx: The row index to process.
+
+ Returns:
+ dict[str, object]: Merged patch document where explicit patches take precedence over
+ field patches. Returns empty dict if patch document is invalid or missing.
+ """
  # Get the field patch for this row
  field_patch = field_patches.iloc[row_idx]

@@ -1150,9 +1196,21 @@ def _ensure_dict_patch(
  metadata_df: pd.DataFrame,
  final_patch_column: str,
  row_idx: int,
- ) -> tuple[dict[str, object], list[str]]:
+ ) -> tuple[dict[str, object], list[ValidationError]]:
+ """Ensure a patch value is a dictionary, converting from JSON string if needed.
+
+ Args:
+ metadata_df: DataFrame containing the patch data.
+ final_patch_column: Name of the column containing the final patch document.
+ row_idx: The row index to process.
+
+ Returns:
+ tuple[dict[str, object], list[ValidationError]]: A tuple containing:
+ - The patch as a dictionary (empty dict if invalid or missing)
+ - List of validation errors (empty if no errors)
+ """
  patch = metadata_df.loc[row_idx, final_patch_column]
- validation_errors = []
+ validation_errors: list[ValidationError] = []

  # For None/null values, return an empty dict
  if patch is None:
@@ -1171,25 +1229,26 @@ def _ensure_dict_patch(
  try:
  parsed = json.loads(patch)
  if isinstance(parsed, dict):
- return parsed
+ return parsed, validation_errors
  except json.JSONDecodeError as e:
- error_msg = f"Row {row_idx}: Invalid JSON in patch document: {e}"
- logger.warning(error_msg)
- validation_errors.append(error_msg)
+ error_msg = f"Invalid JSON in patch document: {e}"
+ logger.warning(f"Row {row_idx}: {error_msg}")
+ validation_errors.append(
+ InvalidPatchDocumentFormat(row_idx, error_msg)
+ )
  return {}, validation_errors # if not validate else None
  else:
- error_msg = (
- f"Row {row_idx}: JSON must be an object/dictionary, "
- f"got {type(parsed).__name__}"
+ error_msg = f"JSON must be an object/dictionary, got {type(parsed).__name__}"
+ logger.warning(f"Row {row_idx}: {error_msg}")
+ validation_errors.append(
+ InvalidPatchDocumentFormat(row_idx, error_msg)
  )
- logger.warning(error_msg)
- validation_errors.append(error_msg)
  return {}, validation_errors # if not validate else None

  # For other types, log warning
- error_msg = f"Row {row_idx}: Unsupported patch type: {type(patch).__name__}"
- logger.warning(error_msg)
- validation_errors.append(error_msg)
+ error_msg = f"Unsupported patch type: {type(patch).__name__}"
+ logger.warning(f"Row {row_idx}: {error_msg}")
+ validation_errors.append(InvalidPatchDocumentFormat(row_idx, error_msg))
  return {}, validation_errors # if not validate else None


@@ -1197,6 +1256,16 @@ def _format_note_for_storage(
  note_text: str,
  current_time_ms: int,
  ) -> list[str] | None:
+ """Format a note text into a JSON-serialized list for storage.
+
+ Args:
+ note_text: The note text content to format.
+ current_time_ms: The current timestamp in milliseconds.
+
+ Returns:
+ list[str] | None: A list containing a single JSON string with note metadata
+ (text, updated_by, updated_at), or None if note_text is NaN/missing.
+ """
  if pd.isna(note_text):
  return None
  note_obj = {
@@ -1213,6 +1282,19 @@ def _log_flight_update_summary(
  request_type: FlightRequestType,
  response: FlightPostArrowFileResponse,
  ) -> None:
+ """Log a structured summary of Flight update results including metrics and errors.
+
+ Args:
+ project_name: Name of the project being updated.
+ total_spans: Total number of spans in the update request.
+ request_type: The type of Flight request being performed.
+ response: The Flight response object containing update results and errors.
+
+ Notes:
+ Logs one summary line with aggregated metrics, plus individual error lines
+ for any failed span updates. Metrics include success rate, spans processed,
+ and failure counts.
+ """
  spans_updated = getattr(response, "spans_updated", None)
  if spans_updated is None:
  # Fallback for older response types
@@ -1276,6 +1358,18 @@ def _message_to_dict(
  preserve_names: bool = True,
  use_int_enums: bool = False,
  ) -> dict[str, object]:
+ """Convert a protobuf Message to a dictionary representation.
+
+ Args:
+ msg: The protobuf Message to convert.
+ preserve_names: If True, preserve original proto field names. If False, use
+ lowerCamelCase names. Defaults to True.
+ use_int_enums: If True, represent enum values as integers. If False, use
+ enum string names. Defaults to False.
+
+ Returns:
+ dict[str, object]: Dictionary representation of the protobuf message.
+ """
  return json_format.MessageToDict(
  msg,
  preserving_proto_field_name=preserve_names,
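
Note: given the argument names documented in the new `export_to_parquet` docstring above, a call could look like the following minimal sketch. It assumes an already-configured `SpansClient` instance named `spans_client` (its construction is not shown in this diff), and the space ID, project name, and path are placeholders.

    from datetime import datetime, timezone

    # Hypothetical setup: obtain a SpansClient however your application does;
    # construction is outside the scope of this diff.
    # spans_client = ...

    spans_client.export_to_parquet(
        path="spans.parquet",                  # Parquet file written here
        space_id="YOUR_SPACE_ID",              # placeholder space ID
        project_name="my-llm-project",         # placeholder project name
        start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),
        end_time=datetime(2024, 1, 2, tzinfo=timezone.utc),
        where="span.status_code = 'ERROR'",    # optional SQL-like filter from the docstring example
    )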
arize/spans/columns.py CHANGED
@@ -39,8 +39,6 @@ class SpanColumn:
  self.data_type = data_type


- #
- #
  # Root level columns
  SPAN_TRACE_ID_COL = SpanColumn(
  name="context.trace_id",
@@ -96,18 +94,18 @@ SPAN_KIND_COL = SpanColumn(
  data_type=SpanColumnDataType.STRING,
  )
  # Attributes Exception columns
- # SPAN_ATTRIBUTES_EXCEPTION_TYPE_COL = SpanColumn(
- # name=f"attributes.{otel.SpanAttributes.EXCEPTION_TYPE}",
- # data_type=SpanColumnDataType.STRING,
- # )
+ SPAN_ATTRIBUTES_EXCEPTION_TYPE_COL = SpanColumn(
+ name=f"attributes.{otel.SpanAttributes.EXCEPTION_TYPE}",
+ data_type=SpanColumnDataType.STRING,
+ )
  SPAN_ATTRIBUTES_EXCEPTION_MESSAGE_COL = SpanColumn(
  name=f"attributes.{otel.SpanAttributes.EXCEPTION_MESSAGE}",
  data_type=SpanColumnDataType.STRING,
  )
- # SPAN_ATTRIBUTES_EXCEPTION_ESCAPED_COL = SpanColumn(
- # name=f"attributes.{otel.SpanAttributes.EXCEPTION_ESCAPED}",
- # data_type=SpanColumnDataType.BOOL,
- # )
+ SPAN_ATTRIBUTES_EXCEPTION_ESCAPED_COL = SpanColumn(
+ name=f"attributes.{otel.SpanAttributes.EXCEPTION_ESCAPED}",
+ data_type=SpanColumnDataType.BOOL,
+ )
  SPAN_ATTRIBUTES_EXCEPTION_STACKTRACE_COL = SpanColumn(
  name=f"attributes.{otel.SpanAttributes.EXCEPTION_STACKTRACE}",
  data_type=SpanColumnDataType.STRING,
@@ -176,20 +174,19 @@ SPAN_ATTRIBUTES_LLM_PROMPT_TEMPLATE_VERSION_COL = SpanColumn(
  name=f"attributes.{oinf.SpanAttributes.LLM_PROMPT_TEMPLATE_VERSION}",
  data_type=SpanColumnDataType.STRING,
  )
- # SPAN_ATTRIBUTES_LLM_PROMPT_TOKEN_COUNT_COL = SpanColumn(
- # name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_PROMPT}",
- # data_type=SpanColumnDataType.NUMERIC,
- # )
- # SPAN_ATTRIBUTES_LLM_COMPLETION_TOKEN_COUNT_COL = SpanColumn(
- # name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_COMPLETION}",
- # data_type=SpanColumnDataType.NUMERIC,
- # )
- # SPAN_ATTRIBUTES_LLM_TOTAL_TOKEN_COUNT_COL = SpanColumn(
- # name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_TOTAL}",
- # data_type=SpanColumnDataType.NUMERIC,
- # )
+ SPAN_ATTRIBUTES_LLM_PROMPT_TOKEN_COUNT_COL = SpanColumn(
+ name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_PROMPT}",
+ data_type=SpanColumnDataType.NUMERIC,
+ )
+ SPAN_ATTRIBUTES_LLM_COMPLETION_TOKEN_COUNT_COL = SpanColumn(
+ name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_COMPLETION}",
+ data_type=SpanColumnDataType.NUMERIC,
+ )
+ SPAN_ATTRIBUTES_LLM_TOTAL_TOKEN_COUNT_COL = SpanColumn(
+ name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_TOTAL}",
+ data_type=SpanColumnDataType.NUMERIC,
+ )
  # Attributes Message Keys
- # SPAN_ATTRIBUTES_MESSAGE_NAME_KEY = f"{oinf.MessageAttributes.MESSAGE_NAME}"
  SPAN_ATTRIBUTES_MESSAGE_ROLE_KEY = f"{oinf.MessageAttributes.MESSAGE_ROLE}"
  SPAN_ATTRIBUTES_MESSAGE_CONTENT_KEY = (
  f"{oinf.MessageAttributes.MESSAGE_CONTENT}"
@@ -223,7 +220,6 @@ SPAN_ATTRIBUTES_RETRIEVAL_DOCUMENTS_COL = SpanColumn(
  )
  # Document Object Keys
  SPAN_ATTRIBUTES_DOCUMENT_ID_KEY = f"{oinf.DocumentAttributes.DOCUMENT_ID}"
- # SPAN_ATTRIBUTES_DOCUMENT_SCORE_KEY = f"{oinf.DocumentAttributes.DOCUMENT_SCORE}"
  SPAN_ATTRIBUTES_DOCUMENT_CONTENT_KEY = (
  f"{oinf.DocumentAttributes.DOCUMENT_CONTENT}"
  )
@@ -247,10 +243,10 @@ SPAN_ATTRIBUTES_RERANKER_MODEL_NAME_COL = SpanColumn(
  name=f"attributes.{oinf.RerankerAttributes.RERANKER_MODEL_NAME}",
  data_type=SpanColumnDataType.STRING,
  )
- # SPAN_ATTRIBUTES_RERANKER_TOP_K_COL = SpanColumn(
- # name=f"attributes.{oinf.RerankerAttributes.RERANKER_TOP_K}",
- # data_type=SpanColumnDataType.NUMERIC,
- # )
+ SPAN_ATTRIBUTES_RERANKER_TOP_K_COL = SpanColumn(
+ name=f"attributes.{oinf.RerankerAttributes.RERANKER_TOP_K}",
+ data_type=SpanColumnDataType.NUMERIC,
+ )
  SPAN_ATTRIBUTES_SESSION_ID = SpanColumn(
  name=f"attributes.{oinf.SpanAttributes.SESSION_ID}",
  data_type=SpanColumnDataType.STRING,
@@ -281,9 +277,9 @@ SPAN_OPENINFERENCE_COLUMNS = [
  SPAN_STATUS_CODE_COL,
  SPAN_STATUS_MESSAGE_COL,
  SPAN_EVENTS_COL,
- # SPAN_ATTRIBUTES_EXCEPTION_TYPE_COL,
+ SPAN_ATTRIBUTES_EXCEPTION_TYPE_COL,
  SPAN_ATTRIBUTES_EXCEPTION_MESSAGE_COL,
- # SPAN_ATTRIBUTES_EXCEPTION_ESCAPED_COL,
+ SPAN_ATTRIBUTES_EXCEPTION_ESCAPED_COL,
  SPAN_ATTRIBUTES_EXCEPTION_STACKTRACE_COL,
  SPAN_ATTRIBUTES_INPUT_VALUE_COL,
  SPAN_ATTRIBUTES_INPUT_MIME_TYPE_COL,
@@ -297,9 +293,9 @@ SPAN_OPENINFERENCE_COLUMNS = [
  SPAN_ATTRIBUTES_LLM_PROMPT_TEMPLATE_TEMPLATE_COL,
  SPAN_ATTRIBUTES_LLM_PROMPT_TEMPLATE_VARIABLES_COL,
  SPAN_ATTRIBUTES_LLM_PROMPT_TEMPLATE_VERSION_COL,
- # SPAN_ATTRIBUTES_LLM_PROMPT_TOKEN_COUNT_COL,
- # SPAN_ATTRIBUTES_LLM_COMPLETION_TOKEN_COUNT_COL,
- # SPAN_ATTRIBUTES_LLM_TOTAL_TOKEN_COUNT_COL,
+ SPAN_ATTRIBUTES_LLM_PROMPT_TOKEN_COUNT_COL,
+ SPAN_ATTRIBUTES_LLM_COMPLETION_TOKEN_COUNT_COL,
+ SPAN_ATTRIBUTES_LLM_TOTAL_TOKEN_COUNT_COL,
  SPAN_ATTRIBUTES_TOOL_NAME_COL,
  SPAN_ATTRIBUTES_TOOL_DESCRIPTION_COL,
  SPAN_ATTRIBUTES_TOOL_PARAMETERS_COL,
@@ -308,18 +304,18 @@ SPAN_OPENINFERENCE_COLUMNS = [
  SPAN_ATTRIBUTES_RERANKER_OUTPUT_DOCUMENTS_COL,
  SPAN_ATTRIBUTES_RERANKER_QUERY_COL,
  SPAN_ATTRIBUTES_RERANKER_MODEL_NAME_COL,
- # SPAN_ATTRIBUTES_RERANKER_TOP_K_COL,
+ SPAN_ATTRIBUTES_RERANKER_TOP_K_COL,
  SPAN_ATTRIBUTES_SESSION_ID,
  SPAN_ATTRIBUTES_USER_ID,
  SPAN_ATTRIBUTES_METADATA,
  SPAN_ATTRIBUTES_LLM_TOOLS_COL,
  ]
- #
+
  # List of columns that must be present in the dataframe
  SPAN_OPENINFERENCE_REQUIRED_COLUMNS = [
  col for col in SPAN_OPENINFERENCE_COLUMNS if col.required
  ]
- #
+
  # Eval columns
  # EVAL_COLUMN_PREFIX = "eval."
  # SESSION_EVAL_COLUMN_PREFIX = "session_eval."
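
Note: since this diff re-enables the exception, token-count, and reranker top-k columns in `SPAN_OPENINFERENCE_COLUMNS`, one way to see the full set of columns the SDK now recognizes is to iterate that list. A minimal sketch, assuming the import path shown in this diff and that `SpanColumn` exposes the `name` and `data_type` it was constructed with:

    from arize.spans.columns import (
        SPAN_OPENINFERENCE_COLUMNS,
        SPAN_OPENINFERENCE_REQUIRED_COLUMNS,
    )

    # Print every recognized span column and its declared data type,
    # including the re-enabled exception and token-count columns.
    for col in SPAN_OPENINFERENCE_COLUMNS:
        print(col.name, col.data_type)

    # Columns that must be present in a spans dataframe.
    print([col.name for col in SPAN_OPENINFERENCE_REQUIRED_COLUMNS])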
arize/spans/conversion.py CHANGED
@@ -7,22 +7,21 @@ from datetime import datetime, timezone
  import numpy as np
  import pandas as pd

- # from arize.utils.logging import logger
  from arize.spans.columns import SPAN_OPENINFERENCE_COLUMNS, SpanColumnDataType


  def convert_timestamps(df: pd.DataFrame, fmt: str = "") -> pd.DataFrame:
- """Convert timestamp columns in a DataFrame to nanoseconds.
+ """Convert timestamp columns in a :class:`pandas.DataFrame` to nanoseconds.

  Args:
- df: The pandas DataFrame containing timestamp columns.
+ df: The :class:`pandas.DataFrame` containing timestamp columns.
  fmt: Optional datetime format string for parsing string timestamps. Defaults to "".

  Returns:
- The DataFrame with timestamp columns converted to nanoseconds.
+ The :class:`pandas.DataFrame` with timestamp columns converted to nanoseconds.

  Raises:
- KeyError: If required timestamp column is not found in DataFrame.
+ KeyError: If required timestamp column is not found in :class:`pandas.DataFrame`.
  """
  for col in SPAN_OPENINFERENCE_COLUMNS:
  if col.data_type != SpanColumnDataType.TIMESTAMP:
@@ -70,7 +69,7 @@ def jsonify_dictionaries(df: pd.DataFrame) -> pd.DataFrame:
  """Convert dictionary and list-of-dictionary columns to JSON strings.

  Args:
- df: The pandas DataFrame containing dictionary columns.
+ df: The :class:`pandas.DataFrame` containing dictionary columns.

  Returns:
  The DataFrame with dictionary columns converted to JSON strings.
@@ -121,17 +120,19 @@ def is_missing_value(value: object) -> bool:
  np.inf,
  -np.inf,
  )
- return value in assumed_missing_values or pd.isna(value)
+ return value in assumed_missing_values or pd.isna(value) # type: ignore[call-overload]


  def _jsonify_list_of_dicts(
  list_of_dicts: Iterable[dict[str, object]] | None,
  ) -> list[str]:
- if not isinstance(list_of_dicts, Iterable) and is_missing_value(
- list_of_dicts
- ):
+ if list_of_dicts is None or is_missing_value(list_of_dicts):
  return []
- return [_jsonify_dict(d) for d in list_of_dicts]
+ return [
+ result
+ for d in list_of_dicts
+ if (result := _jsonify_dict(d)) is not None
+ ]


  def _jsonify_dict(d: dict[str, object] | None) -> str | None:
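
Note: the rewritten `_jsonify_list_of_dicts` guards against a missing input up front and then drops any entries for which `_jsonify_dict` returns None. A standalone illustration of that walrus-operator filtering pattern, where the `jsonify` helper below is hypothetical and merely stands in for `_jsonify_dict`:

    import json


    def jsonify(d: dict | None) -> str | None:
        # Stand-in for _jsonify_dict: return None for missing/empty values
        # so the caller can filter them out.
        if not d:
            return None
        return json.dumps(d)


    records = [{"a": 1}, None, {"b": 2}]
    # Keep only non-None results; the walrus operator binds each conversion
    # so it is computed once per element.
    jsonified = [result for d in records if (result := jsonify(d)) is not None]
    print(jsonified)  # ['{"a": 1}', '{"b": 2}']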
@@ -65,7 +65,7 @@ def check_invalid_annotation_column_names(
  df: pd.DataFrame,
  ) -> list[ValidationError]:
  """Checks for columns that start with 'annotation.' but don't match the expected pattern."""
- errors = []
+ errors: list[ValidationError] = []

  invalid_annotation_columns = [
  col
@@ -78,7 +78,7 @@ def check_annotation_updated_at_timestamp(
  df: pd.DataFrame,
  col_name: str,
  is_required: bool,
- ) -> list[InvalidMissingValueInColumn | InvalidAnnotationTimestamp]:
+ ) -> list[ValidationError]:
  """Validates annotation timestamp values for validity and acceptable ranges.

  Checks that timestamp values are positive, not in the future, and satisfy
@@ -96,7 +96,7 @@
  if col_name not in df.columns:
  return []

- errors = []
+ errors: list[ValidationError] = []
  if is_required and df[col_name].isnull().any():
  errors.append(
  InvalidMissingValueInColumn(
@@ -131,7 +131,7 @@ def check_annotation_cols(
  dataframe: pd.DataFrame,
  ) -> list[ValidationError]:
  """Checks value length and validity for columns matching annotation patterns."""
- checks = []
+ checks: list[list[ValidationError]] = []
  for col in dataframe.columns:
  if col.endswith(ANNOTATION_LABEL_SUFFIX):
  checks.append(
@@ -140,7 +140,8 @@
  col_name=col,
  min_len=ANNOTATION_LABEL_MIN_STR_LENGTH,
  max_len=ANNOTATION_LABEL_MAX_STR_LENGTH,
- is_required=False, # Individual columns are not required, null check handles completeness
+ # Individual columns are not required
+ is_required=False,
  )
  )
  elif col.endswith(ANNOTATION_SCORE_SUFFIX):
@@ -231,15 +232,11 @@
  col_name = ANNOTATION_NOTES_COLUMN_NAME
  if col_name in dataframe.columns:
  # Validate the length of the raw string
- return list(
- chain(
- *common_value_validation.check_string_column_value_length(
- df=dataframe,
- col_name=col_name,
- min_len=0, # Allow empty notes
- max_len=ANNOTATION_NOTES_MAX_STR_LENGTH,
- is_required=False,
- )
- )
+ return common_value_validation.check_string_column_value_length(
+ df=dataframe,
+ col_name=col_name,
+ min_len=0, # Allow empty notes
+ max_len=ANNOTATION_NOTES_MAX_STR_LENGTH,
+ is_required=False,
  )
  return []
@@ -39,13 +39,13 @@ def check_field_convertible_to_str(
  def check_dataframe_type(
  dataframe: object,
  ) -> list[InvalidTypeArgument]:
- """Validates that the provided argument is a pandas DataFrame.
+ """Validates that the provided argument is a :class:`pandas.DataFrame`.

  Args:
- dataframe: The object to validate as a pandas DataFrame.
+ dataframe: The object to validate as a :class:`pandas.DataFrame`.

  Returns:
- List of validation errors if not a DataFrame (empty if valid).
+ List of validation errors if not a :class:`pandas.DataFrame` (empty if valid).
  """
  if not isinstance(dataframe, pd.DataFrame):
  return [
@@ -17,10 +17,10 @@ if TYPE_CHECKING:
  def check_dataframe_index(
  dataframe: pd.DataFrame,
  ) -> list[InvalidDataFrameIndex]:
- """Validates that the DataFrame has a default integer index.
+ """Validates that the :class:`pandas.DataFrame` has a default integer index.

  Args:
- dataframe: The DataFrame to validate.
+ dataframe: The :class:`pandas.DataFrame` to validate.

  Returns:
  List of validation errors if index is not default (empty if valid).
@@ -34,10 +34,10 @@ def check_dataframe_required_column_set(
  df: pd.DataFrame,
  required_columns: list[str],
  ) -> list[InvalidDataFrameMissingColumns]:
- """Validates that the DataFrame contains all required columns.
+ """Validates that the :class:`pandas.DataFrame` contains all required columns.

  Args:
- df: The DataFrame to validate.
+ df: The :class:`pandas.DataFrame` to validate.
  required_columns: List of column names that must be present.

  Returns:
@@ -56,10 +56,10 @@ def check_dataframe_required_column_set(
  def check_dataframe_for_duplicate_columns(
  df: pd.DataFrame,
  ) -> list[InvalidDataFrameDuplicateColumns]:
- """Validates that the DataFrame has no duplicate column names.
+ """Validates that the :class:`pandas.DataFrame` has no duplicate column names.

  Args:
- df: The DataFrame to validate.
+ df: The :class:`pandas.DataFrame` to validate.

  Returns:
  List of validation errors if duplicate columns exist (empty if valid).
@@ -67,5 +67,5 @@ def check_dataframe_for_duplicate_columns(
  # Get the duplicated column names from the dataframe
  duplicate_columns = df.columns[df.columns.duplicated()]
  if not duplicate_columns.empty:
- return [InvalidDataFrameDuplicateColumns(duplicate_columns)]
+ return [InvalidDataFrameDuplicateColumns(duplicate_columns.tolist())]
  return []