arize 8.0.0b1__py3-none-any.whl → 8.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +9 -2
- arize/_client_factory.py +50 -0
- arize/_exporter/client.py +18 -17
- arize/_exporter/parsers/tracing_data_parser.py +9 -4
- arize/_exporter/validation.py +1 -1
- arize/_flight/client.py +37 -17
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +6 -6
- arize/_generated/api_client/api/projects_api.py +3 -3
- arize/_lazy.py +61 -10
- arize/client.py +66 -50
- arize/config.py +175 -48
- arize/constants/config.py +1 -0
- arize/constants/ml.py +9 -16
- arize/constants/spans.py +5 -10
- arize/datasets/client.py +45 -28
- arize/datasets/errors.py +1 -1
- arize/datasets/validation.py +2 -2
- arize/embeddings/auto_generator.py +16 -9
- arize/embeddings/base_generators.py +15 -9
- arize/embeddings/cv_generators.py +2 -2
- arize/embeddings/errors.py +2 -2
- arize/embeddings/nlp_generators.py +8 -8
- arize/embeddings/tabular_generators.py +6 -6
- arize/exceptions/base.py +0 -52
- arize/exceptions/config.py +22 -0
- arize/exceptions/parameters.py +1 -330
- arize/exceptions/values.py +8 -5
- arize/experiments/__init__.py +4 -0
- arize/experiments/client.py +31 -18
- arize/experiments/evaluators/base.py +12 -9
- arize/experiments/evaluators/executors.py +16 -7
- arize/experiments/evaluators/rate_limiters.py +3 -1
- arize/experiments/evaluators/types.py +9 -7
- arize/experiments/evaluators/utils.py +7 -5
- arize/experiments/functions.py +128 -58
- arize/experiments/tracing.py +4 -1
- arize/experiments/types.py +34 -31
- arize/logging.py +54 -33
- arize/ml/batch_validation/errors.py +10 -1004
- arize/ml/batch_validation/validator.py +351 -291
- arize/ml/bounded_executor.py +25 -6
- arize/ml/casting.py +51 -33
- arize/ml/client.py +43 -35
- arize/ml/proto.py +21 -22
- arize/ml/stream_validation.py +64 -27
- arize/ml/surrogate_explainer/mimic.py +18 -10
- arize/ml/types.py +27 -67
- arize/pre_releases.py +10 -6
- arize/projects/client.py +9 -4
- arize/py.typed +0 -0
- arize/regions.py +11 -11
- arize/spans/client.py +125 -31
- arize/spans/columns.py +32 -36
- arize/spans/conversion.py +12 -11
- arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
- arize/spans/validation/annotations/value_validation.py +11 -14
- arize/spans/validation/common/argument_validation.py +3 -3
- arize/spans/validation/common/dataframe_form_validation.py +7 -7
- arize/spans/validation/common/value_validation.py +11 -14
- arize/spans/validation/evals/dataframe_form_validation.py +4 -4
- arize/spans/validation/evals/evals_validation.py +6 -6
- arize/spans/validation/evals/value_validation.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +1 -1
- arize/spans/validation/metadata/dataframe_form_validation.py +2 -2
- arize/spans/validation/metadata/value_validation.py +23 -1
- arize/spans/validation/spans/dataframe_form_validation.py +2 -2
- arize/spans/validation/spans/spans_validation.py +6 -6
- arize/utils/arrow.py +38 -2
- arize/utils/cache.py +2 -2
- arize/utils/dataframe.py +4 -4
- arize/utils/online_tasks/dataframe_preprocessor.py +15 -11
- arize/utils/openinference_conversion.py +10 -10
- arize/utils/proto.py +0 -1
- arize/utils/types.py +6 -6
- arize/version.py +1 -1
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/METADATA +32 -7
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/RECORD +81 -78
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/WHEEL +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/LICENSE +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/NOTICE +0 -0
arize/spans/client.py
CHANGED
@@ -1,6 +1,5 @@
 """Client implementation for managing spans and traces in the Arize platform."""
 
-# type: ignore[pb2]
 from __future__ import annotations
 
 import json
@@ -21,12 +20,16 @@ from arize._flight.types import FlightRequestType
 from arize.constants.spans import DEFAULT_DATETIME_FMT
 from arize.exceptions.base import (
     INVALID_ARROW_CONVERSION_MSG,
+    ValidationError,
     ValidationFailure,
 )
 from arize.exceptions.models import MissingProjectNameError
 from arize.exceptions.spaces import MissingSpaceIDError
 from arize.logging import CtxAdapter
 from arize.ml.types import Environments
+from arize.spans.validation.metadata.value_validation import (
+    InvalidPatchDocumentFormat,
+)
 from arize.utils.arrow import post_arrow_table
 from arize.utils.dataframe import (
     remove_extraneous_columns,
@@ -78,10 +81,11 @@ class SpansClient:
         Args:
             space_id: The space ID where the project resides.
             project_name: A unique name to identify your project in the Arize platform.
-            dataframe: The dataframe containing the LLM traces.
-            evals_dataframe: A dataframe containing
-                The evaluations are joined to their corresponding spans
-                using only `context.span_id` from the spans
+            dataframe (:class:`pandas.DataFrame`): The dataframe containing the LLM traces.
+            evals_dataframe (:class:`pandas.DataFrame` | :obj:`None`): A dataframe containing
+                LLM evaluations data. The evaluations are joined to their corresponding spans
+                via a left outer join, i.e., using only `context.span_id` from the spans
+                dataframe. Defaults to None.
             datetime_format: format for the timestamp captured in the LLM traces.
                 Defaults to "%Y-%m-%dT%H:%M:%S.%f+00:00".
             validate: When set to True, validation is run before sending data.
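An illustrative sketch of the join semantics the revised docstring describes (not part of the package diff): evaluations are attached to spans keyed only on `context.span_id`, so spans without evals are kept and eval rows without a matching span are dropped. The column names follow the docstring; the data is invented for illustration.

import pandas as pd

spans_df = pd.DataFrame(
    {
        "context.span_id": ["a", "b"],
        "name": ["llm_call", "retriever"],
    }
)
evals_df = pd.DataFrame(
    {
        "context.span_id": ["a", "c"],  # "c" matches no span
        "eval.correctness.label": ["correct", "incorrect"],
    }
)

# Left outer join keyed on the spans dataframe: span "b" keeps NaN eval
# columns, and eval row "c" is dropped entirely.
joined = spans_df.merge(evals_df, on="context.span_id", how="left")
print(joined)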
@@ -280,7 +284,7 @@ class SpansClient:
         Args:
             space_id: The space ID where the project resides.
             project_name: A unique name to identify your project in the Arize platform.
-            dataframe: A dataframe containing LLM evaluations data.
+            dataframe (:class:`pandas.DataFrame`): A dataframe containing LLM evaluations data.
             validate: When set to True, validation is run before sending data.
                 Defaults to True.
             force_http: Force the use of HTTP for data upload. Defaults to False.
@@ -453,7 +457,7 @@ class SpansClient:
         Args:
             space_id: The space ID where the project resides.
             project_name: A unique name to identify your project in the Arize platform.
-            dataframe: A dataframe containing LLM annotation data.
+            dataframe (:class:`pandas.DataFrame`): A dataframe containing LLM annotation data.
             validate: When set to True, validation is run before sending data.
                 Defaults to True.
         """
@@ -684,7 +688,8 @@ class SpansClient:
         Args:
             space_id: The space ID where the project resides.
             project_name: A unique name to identify your project in the Arize platform.
-            dataframe: DataFrame with span_ids and either patch
+            dataframe (:class:`pandas.DataFrame`): DataFrame with span_ids and either patch
+                documents or metadata field columns.
             patch_document_column_name: Name of the column containing JSON patch documents.
                 Defaults to "patch_document".
             validate: When set to True, validation is run before sending data.
@@ -848,7 +853,8 @@ class SpansClient:
                 )
                 for idx in range(len(metadata_df))
             ]
-            metadata_df[final_patch_column] = merged_patches
+            # Type ignore: pandas DataFrame column assignment type is overly restrictive
+            metadata_df[final_patch_column] = merged_patches  # type: ignore[assignment]
         else:
             # Just use the field patches directly
             metadata_df[final_patch_column] = field_patches
@@ -885,7 +891,8 @@ class SpansClient:
                 log.error(e)
                 raise ValidationFailure(validation_errors)
 
-            metadata_df[final_patch_column] = processed_patches
+            # Type ignore: pandas DataFrame column assignment type is overly restrictive
+            metadata_df[final_patch_column] = processed_patches  # type: ignore[assignment]
 
         # Run validations on the processed dataframe
         if validate:
@@ -1004,14 +1011,14 @@ class SpansClient:
         columns: list | None = None,
         stream_chunk_size: int | None = None,
     ) -> pd.DataFrame:
-        """Export span data from Arize to a pandas DataFrame.
+        """Export span data from Arize to a :class:`pandas.DataFrame`.
 
         Retrieves trace/span data from the specified project within a time range
-        and returns it as a pandas DataFrame. Supports filtering with SQL-like
+        and returns it as a :class:`pandas.DataFrame`. Supports filtering with SQL-like
         WHERE clauses and similarity search for semantic retrieval.
 
         Returns:
-            DataFrame containing the requested span data with columns
+            :class:`pandas.DataFrame`: DataFrame containing the requested span data with columns
             for span metadata, attributes, events, and any custom fields.
         """
         with ArizeFlightClient(
@@ -1052,8 +1059,26 @@ class SpansClient:
 
         Retrieves trace/span data from the specified project within a time range
         and writes it directly to a Parquet file at the specified path. Supports
-        filtering with SQL-like WHERE clauses
-
+        filtering with SQL-like WHERE clauses for efficient querying. Ideal for
+        large datasets and long-term storage.
+
+        Args:
+            path: The file path where the Parquet file will be written.
+            space_id: The space ID where the project resides.
+            project_name: The name of the project to export span data from.
+            start_time: Start of the time range (inclusive) as a datetime object.
+            end_time: End of the time range (inclusive) as a datetime object.
+            where: Optional SQL-like WHERE clause to filter rows (e.g., "span.status_code = 'ERROR'").
+            columns: Optional list of column names to include. If None, all columns are returned.
+            stream_chunk_size: Optional chunk size for streaming large result sets.
+
+        Raises:
+            RuntimeError: If the Flight client request fails or returns no response.
+
+        Notes:
+            - Uses Apache Arrow Flight for efficient data transfer
+            - Data is written directly to the specified path as a Parquet file
+            - Large exports may benefit from specifying stream_chunk_size
         """
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
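A hedged usage sketch of the export_to_parquet API documented in the hunk above. The parameter names come from the new docstring; the wrapper function and the pre-configured `spans_client` argument are assumptions, not confirmed SDK surface.

from datetime import datetime, timezone

def export_error_spans(spans_client) -> None:
    # spans_client: an assumed, already-configured SpansClient instance.
    spans_client.export_to_parquet(
        path="spans.parquet",
        space_id="my-space-id",
        project_name="my-project",
        start_time=datetime(2025, 1, 1, tzinfo=timezone.utc),
        end_time=datetime(2025, 1, 2, tzinfo=timezone.utc),
        where="span.status_code = 'ERROR'",  # optional SQL-like row filter
        columns=None,              # None returns all columns
        stream_chunk_size=10_000,  # can help with very large exports
    )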
@@ -1066,7 +1091,7 @@ class SpansClient:
         exporter = ArizeExportClient(
             flight_client=flight_client,
         )
-
+        exporter.export_to_parquet(
             path=path,
             space_id=space_id,
             model_id=project_name,
@@ -1080,6 +1105,15 @@ class SpansClient:
 
 
 def _build_patch_document(row: pd.Series) -> dict[str, object]:
+    """Build a patch document from a pandas Series row by extracting metadata fields.
+
+    Args:
+        row: A pandas Series representing a row of data with potential metadata columns.
+
+    Returns:
+        dict[str, object]: A dictionary mapping metadata field names (without the
+            'attributes.metadata.' prefix) to their values, preserving arrays and scalars.
+    """
     # Extract and preserve metadata values with proper types
     patch = {}
     for key in row.index:
@@ -1101,9 +1135,21 @@ def _build_patch_document(row: pd.Series) -> dict[str, object]:
 def _process_patch_document(
     metadata_df: pd.DataFrame,
     patch_document_column_name: str,
-    field_patches: pd.
+    field_patches: pd.Series[Any],
     row_idx: int,
 ) -> dict[str, object]:
+    """Process and merge patch documents from field patches and explicit patch column.
+
+    Args:
+        metadata_df: DataFrame containing the metadata with patch documents.
+        patch_document_column_name: Name of the column containing explicit patch documents.
+        field_patches: DataFrame containing patches derived from individual metadata fields.
+        row_idx: The row index to process.
+
+    Returns:
+        dict[str, object]: Merged patch document where explicit patches take precedence over
+            field patches. Returns empty dict if patch document is invalid or missing.
+    """
     # Get the field patch for this row
     field_patch = field_patches.iloc[row_idx]
 
@@ -1150,9 +1196,21 @@ def _ensure_dict_patch(
     metadata_df: pd.DataFrame,
     final_patch_column: str,
     row_idx: int,
-) -> tuple[dict[str, object], list[
+) -> tuple[dict[str, object], list[ValidationError]]:
+    """Ensure a patch value is a dictionary, converting from JSON string if needed.
+
+    Args:
+        metadata_df: DataFrame containing the patch data.
+        final_patch_column: Name of the column containing the final patch document.
+        row_idx: The row index to process.
+
+    Returns:
+        tuple[dict[str, object], list[ValidationError]]: A tuple containing:
+            - The patch as a dictionary (empty dict if invalid or missing)
+            - List of validation errors (empty if no errors)
+    """
     patch = metadata_df.loc[row_idx, final_patch_column]
-    validation_errors = []
+    validation_errors: list[ValidationError] = []
 
     # For None/null values, return an empty dict
     if patch is None:
@@ -1171,25 +1229,26 @@ def _ensure_dict_patch(
     try:
         parsed = json.loads(patch)
         if isinstance(parsed, dict):
-            return parsed
+            return parsed, validation_errors
     except json.JSONDecodeError as e:
-        error_msg = f"
-        logger.warning(error_msg)
-        validation_errors.append(
+        error_msg = f"Invalid JSON in patch document: {e}"
+        logger.warning(f"Row {row_idx}: {error_msg}")
+        validation_errors.append(
+            InvalidPatchDocumentFormat(row_idx, error_msg)
+        )
         return {}, validation_errors  # if not validate else None
     else:
-        error_msg = (
-
-
+        error_msg = f"JSON must be an object/dictionary, got {type(parsed).__name__}"
+        logger.warning(f"Row {row_idx}: {error_msg}")
+        validation_errors.append(
+            InvalidPatchDocumentFormat(row_idx, error_msg)
         )
-        logger.warning(error_msg)
-        validation_errors.append(error_msg)
         return {}, validation_errors  # if not validate else None
 
     # For other types, log warning
-    error_msg = f"
-    logger.warning(error_msg)
-    validation_errors.append(error_msg)
+    error_msg = f"Unsupported patch type: {type(patch).__name__}"
+    logger.warning(f"Row {row_idx}: {error_msg}")
+    validation_errors.append(InvalidPatchDocumentFormat(row_idx, error_msg))
     return {}, validation_errors  # if not validate else None
 
 
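A minimal standalone sketch of the normalization logic the hunk above implements, assuming nothing beyond the standard library: string patches must decode to a JSON object; anything else yields an empty patch plus an error. In the actual change the errors are InvalidPatchDocumentFormat instances rather than the plain strings used here.

import json

def ensure_dict_patch_sketch(patch):
    # Returns (patch_dict, errors), mirroring the tuple contract above.
    errors = []
    if patch is None:
        return {}, errors
    if isinstance(patch, dict):
        return patch, errors
    if isinstance(patch, str):
        try:
            parsed = json.loads(patch)
        except json.JSONDecodeError as e:
            errors.append(f"Invalid JSON in patch document: {e}")
            return {}, errors
        if isinstance(parsed, dict):
            return parsed, errors
        errors.append(f"JSON must be an object/dictionary, got {type(parsed).__name__}")
        return {}, errors
    errors.append(f"Unsupported patch type: {type(patch).__name__}")
    return {}, errors

print(ensure_dict_patch_sketch('{"env": "prod"}'))  # ({'env': 'prod'}, [])
print(ensure_dict_patch_sketch("[1, 2]"))           # ({}, ['JSON must be an object/dictionary, got list'])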
@@ -1197,6 +1256,16 @@ def _format_note_for_storage(
     note_text: str,
     current_time_ms: int,
 ) -> list[str] | None:
+    """Format a note text into a JSON-serialized list for storage.
+
+    Args:
+        note_text: The note text content to format.
+        current_time_ms: The current timestamp in milliseconds.
+
+    Returns:
+        list[str] | None: A list containing a single JSON string with note metadata
+            (text, updated_by, updated_at), or None if note_text is NaN/missing.
+    """
     if pd.isna(note_text):
         return None
     note_obj = {
@@ -1213,6 +1282,19 @@ def _log_flight_update_summary(
     request_type: FlightRequestType,
     response: FlightPostArrowFileResponse,
 ) -> None:
+    """Log a structured summary of Flight update results including metrics and errors.
+
+    Args:
+        project_name: Name of the project being updated.
+        total_spans: Total number of spans in the update request.
+        request_type: The type of Flight request being performed.
+        response: The Flight response object containing update results and errors.
+
+    Notes:
+        Logs one summary line with aggregated metrics, plus individual error lines
+        for any failed span updates. Metrics include success rate, spans processed,
+        and failure counts.
+    """
     spans_updated = getattr(response, "spans_updated", None)
     if spans_updated is None:
         # Fallback for older response types
@@ -1276,6 +1358,18 @@ def _message_to_dict(
     preserve_names: bool = True,
     use_int_enums: bool = False,
 ) -> dict[str, object]:
+    """Convert a protobuf Message to a dictionary representation.
+
+    Args:
+        msg: The protobuf Message to convert.
+        preserve_names: If True, preserve original proto field names. If False, use
+            lowerCamelCase names. Defaults to True.
+        use_int_enums: If True, represent enum values as integers. If False, use
+            enum string names. Defaults to False.
+
+    Returns:
+        dict[str, object]: Dictionary representation of the protobuf message.
+    """
     return json_format.MessageToDict(
         msg,
         preserving_proto_field_name=preserve_names,
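A sketch of the note storage format described in the new _format_note_for_storage docstring: a single-element list holding one JSON string with text, updated_by, and updated_at. The "session_user" author value is an assumed placeholder, not the SDK's actual value.

import json
import time

def format_note_sketch(note_text: str, current_time_ms: int) -> list[str]:
    note_obj = {
        "text": note_text,
        "updated_by": "session_user",  # assumed placeholder author
        "updated_at": current_time_ms,
    }
    return [json.dumps(note_obj)]

print(format_note_sketch("needs review", int(time.time() * 1000)))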
arize/spans/columns.py
CHANGED
@@ -39,8 +39,6 @@ class SpanColumn:
         self.data_type = data_type
 
 
-#
-#
 # Root level columns
 SPAN_TRACE_ID_COL = SpanColumn(
     name="context.trace_id",
@@ -96,18 +94,18 @@ SPAN_KIND_COL = SpanColumn(
     data_type=SpanColumnDataType.STRING,
 )
 # Attributes Exception columns
-
-
-
-
+SPAN_ATTRIBUTES_EXCEPTION_TYPE_COL = SpanColumn(
+    name=f"attributes.{otel.SpanAttributes.EXCEPTION_TYPE}",
+    data_type=SpanColumnDataType.STRING,
+)
 SPAN_ATTRIBUTES_EXCEPTION_MESSAGE_COL = SpanColumn(
     name=f"attributes.{otel.SpanAttributes.EXCEPTION_MESSAGE}",
     data_type=SpanColumnDataType.STRING,
 )
-
-
-
-
+SPAN_ATTRIBUTES_EXCEPTION_ESCAPED_COL = SpanColumn(
+    name=f"attributes.{otel.SpanAttributes.EXCEPTION_ESCAPED}",
+    data_type=SpanColumnDataType.BOOL,
+)
 SPAN_ATTRIBUTES_EXCEPTION_STACKTRACE_COL = SpanColumn(
     name=f"attributes.{otel.SpanAttributes.EXCEPTION_STACKTRACE}",
     data_type=SpanColumnDataType.STRING,
@@ -176,20 +174,19 @@ SPAN_ATTRIBUTES_LLM_PROMPT_TEMPLATE_VERSION_COL = SpanColumn(
     name=f"attributes.{oinf.SpanAttributes.LLM_PROMPT_TEMPLATE_VERSION}",
     data_type=SpanColumnDataType.STRING,
 )
-
-
-
-
-
-
-
-
-
-
-
-
+SPAN_ATTRIBUTES_LLM_PROMPT_TOKEN_COUNT_COL = SpanColumn(
+    name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_PROMPT}",
+    data_type=SpanColumnDataType.NUMERIC,
+)
+SPAN_ATTRIBUTES_LLM_COMPLETION_TOKEN_COUNT_COL = SpanColumn(
+    name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_COMPLETION}",
+    data_type=SpanColumnDataType.NUMERIC,
+)
+SPAN_ATTRIBUTES_LLM_TOTAL_TOKEN_COUNT_COL = SpanColumn(
+    name=f"attributes.{oinf.SpanAttributes.LLM_TOKEN_COUNT_TOTAL}",
+    data_type=SpanColumnDataType.NUMERIC,
+)
 # Attributes Message Keys
-# SPAN_ATTRIBUTES_MESSAGE_NAME_KEY = f"{oinf.MessageAttributes.MESSAGE_NAME}"
 SPAN_ATTRIBUTES_MESSAGE_ROLE_KEY = f"{oinf.MessageAttributes.MESSAGE_ROLE}"
 SPAN_ATTRIBUTES_MESSAGE_CONTENT_KEY = (
     f"{oinf.MessageAttributes.MESSAGE_CONTENT}"
@@ -223,7 +220,6 @@ SPAN_ATTRIBUTES_RETRIEVAL_DOCUMENTS_COL = SpanColumn(
 )
 # Document Object Keys
 SPAN_ATTRIBUTES_DOCUMENT_ID_KEY = f"{oinf.DocumentAttributes.DOCUMENT_ID}"
-# SPAN_ATTRIBUTES_DOCUMENT_SCORE_KEY = f"{oinf.DocumentAttributes.DOCUMENT_SCORE}"
 SPAN_ATTRIBUTES_DOCUMENT_CONTENT_KEY = (
     f"{oinf.DocumentAttributes.DOCUMENT_CONTENT}"
 )
@@ -247,10 +243,10 @@ SPAN_ATTRIBUTES_RERANKER_MODEL_NAME_COL = SpanColumn(
     name=f"attributes.{oinf.RerankerAttributes.RERANKER_MODEL_NAME}",
     data_type=SpanColumnDataType.STRING,
 )
-
-
-
-
+SPAN_ATTRIBUTES_RERANKER_TOP_K_COL = SpanColumn(
+    name=f"attributes.{oinf.RerankerAttributes.RERANKER_TOP_K}",
+    data_type=SpanColumnDataType.NUMERIC,
+)
 SPAN_ATTRIBUTES_SESSION_ID = SpanColumn(
     name=f"attributes.{oinf.SpanAttributes.SESSION_ID}",
     data_type=SpanColumnDataType.STRING,
@@ -281,9 +277,9 @@ SPAN_OPENINFERENCE_COLUMNS = [
     SPAN_STATUS_CODE_COL,
     SPAN_STATUS_MESSAGE_COL,
     SPAN_EVENTS_COL,
-
+    SPAN_ATTRIBUTES_EXCEPTION_TYPE_COL,
     SPAN_ATTRIBUTES_EXCEPTION_MESSAGE_COL,
-
+    SPAN_ATTRIBUTES_EXCEPTION_ESCAPED_COL,
     SPAN_ATTRIBUTES_EXCEPTION_STACKTRACE_COL,
     SPAN_ATTRIBUTES_INPUT_VALUE_COL,
     SPAN_ATTRIBUTES_INPUT_MIME_TYPE_COL,
@@ -297,9 +293,9 @@ SPAN_OPENINFERENCE_COLUMNS = [
     SPAN_ATTRIBUTES_LLM_PROMPT_TEMPLATE_TEMPLATE_COL,
     SPAN_ATTRIBUTES_LLM_PROMPT_TEMPLATE_VARIABLES_COL,
     SPAN_ATTRIBUTES_LLM_PROMPT_TEMPLATE_VERSION_COL,
-
-
-
+    SPAN_ATTRIBUTES_LLM_PROMPT_TOKEN_COUNT_COL,
+    SPAN_ATTRIBUTES_LLM_COMPLETION_TOKEN_COUNT_COL,
+    SPAN_ATTRIBUTES_LLM_TOTAL_TOKEN_COUNT_COL,
     SPAN_ATTRIBUTES_TOOL_NAME_COL,
     SPAN_ATTRIBUTES_TOOL_DESCRIPTION_COL,
     SPAN_ATTRIBUTES_TOOL_PARAMETERS_COL,
@@ -308,18 +304,18 @@ SPAN_OPENINFERENCE_COLUMNS = [
     SPAN_ATTRIBUTES_RERANKER_OUTPUT_DOCUMENTS_COL,
     SPAN_ATTRIBUTES_RERANKER_QUERY_COL,
     SPAN_ATTRIBUTES_RERANKER_MODEL_NAME_COL,
-
+    SPAN_ATTRIBUTES_RERANKER_TOP_K_COL,
     SPAN_ATTRIBUTES_SESSION_ID,
     SPAN_ATTRIBUTES_USER_ID,
     SPAN_ATTRIBUTES_METADATA,
     SPAN_ATTRIBUTES_LLM_TOOLS_COL,
 ]
-
+
 # List of columns that must be present in the dataframe
 SPAN_OPENINFERENCE_REQUIRED_COLUMNS = [
     col for col in SPAN_OPENINFERENCE_COLUMNS if col.required
 ]
-
+
 # Eval columns
 # EVAL_COLUMN_PREFIX = "eval."
 # SESSION_EVAL_COLUMN_PREFIX = "session_eval."
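For context on the token-count and reranker columns restored above, a sketch of how the dataframe column names are composed, assuming the openinference-semantic-conventions package is what the `oinf` alias refers to.

from openinference.semconv.trace import SpanAttributes

prompt_tokens_col = f"attributes.{SpanAttributes.LLM_TOKEN_COUNT_PROMPT}"
completion_tokens_col = f"attributes.{SpanAttributes.LLM_TOKEN_COUNT_COMPLETION}"
total_tokens_col = f"attributes.{SpanAttributes.LLM_TOKEN_COUNT_TOTAL}"

# e.g. "attributes.llm.token_count.prompt"
print(prompt_tokens_col)
print(completion_tokens_col)
print(total_tokens_col)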
arize/spans/conversion.py
CHANGED
@@ -7,22 +7,21 @@ from datetime import datetime, timezone
 import numpy as np
 import pandas as pd
 
-# from arize.utils.logging import logger
 from arize.spans.columns import SPAN_OPENINFERENCE_COLUMNS, SpanColumnDataType
 
 
 def convert_timestamps(df: pd.DataFrame, fmt: str = "") -> pd.DataFrame:
-    """Convert timestamp columns in a DataFrame to nanoseconds.
+    """Convert timestamp columns in a :class:`pandas.DataFrame` to nanoseconds.
 
     Args:
-        df: The pandas DataFrame containing timestamp columns.
+        df: The :class:`pandas.DataFrame` containing timestamp columns.
         fmt: Optional datetime format string for parsing string timestamps. Defaults to "".
 
     Returns:
-        The DataFrame with timestamp columns converted to nanoseconds.
+        The :class:`pandas.DataFrame` with timestamp columns converted to nanoseconds.
 
     Raises:
-        KeyError: If required timestamp column is not found in DataFrame
+        KeyError: If required timestamp column is not found in :class:`pandas.DataFrame`.
     """
     for col in SPAN_OPENINFERENCE_COLUMNS:
         if col.data_type != SpanColumnDataType.TIMESTAMP:
@@ -70,7 +69,7 @@ def jsonify_dictionaries(df: pd.DataFrame) -> pd.DataFrame:
     """Convert dictionary and list-of-dictionary columns to JSON strings.
 
     Args:
-        df: The pandas DataFrame containing dictionary columns.
+        df: The :class:`pandas.DataFrame` containing dictionary columns.
 
     Returns:
         The DataFrame with dictionary columns converted to JSON strings.
@@ -121,17 +120,19 @@ def is_missing_value(value: object) -> bool:
         np.inf,
         -np.inf,
     )
-    return value in assumed_missing_values or pd.isna(value)
+    return value in assumed_missing_values or pd.isna(value)  # type: ignore[call-overload]
 
 
 def _jsonify_list_of_dicts(
     list_of_dicts: Iterable[dict[str, object]] | None,
 ) -> list[str]:
-    if (
-        list_of_dicts
-    ):
+    if list_of_dicts is None or is_missing_value(list_of_dicts):
         return []
-    return [
+    return [
+        result
+        for d in list_of_dicts
+        if (result := _jsonify_dict(d)) is not None
+    ]
 
 
 def _jsonify_dict(d: dict[str, object] | None) -> str | None:
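A sketch of the behavior change in _jsonify_list_of_dicts above: the rewritten comprehension uses an assignment expression so entries whose serializer returns None are dropped instead of kept. `jsonify_dict_sketch` is a stand-in for the module's private _jsonify_dict.

import json

def jsonify_dict_sketch(d):
    # Stand-in: None for missing values, a JSON string otherwise.
    return None if d is None else json.dumps(d)

items = [{"a": 1}, None, {"b": 2}]
filtered = [
    result
    for d in items
    if (result := jsonify_dict_sketch(d)) is not None
]
print(filtered)  # ['{"a": 1}', '{"b": 2}'] -- the None entry is skipped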
arize/spans/validation/annotations/dataframe_form_validation.py
CHANGED

@@ -65,7 +65,7 @@ def check_invalid_annotation_column_names(
     df: pd.DataFrame,
 ) -> list[ValidationError]:
     """Checks for columns that start with 'annotation.' but don't match the expected pattern."""
-    errors = []
+    errors: list[ValidationError] = []
 
     invalid_annotation_columns = [
         col
arize/spans/validation/annotations/value_validation.py
CHANGED

@@ -78,7 +78,7 @@ def check_annotation_updated_at_timestamp(
     df: pd.DataFrame,
     col_name: str,
     is_required: bool,
-) -> list[
+) -> list[ValidationError]:
     """Validates annotation timestamp values for validity and acceptable ranges.
 
     Checks that timestamp values are positive, not in the future, and satisfy
@@ -96,7 +96,7 @@ def check_annotation_updated_at_timestamp(
     if col_name not in df.columns:
         return []
 
-    errors = []
+    errors: list[ValidationError] = []
    if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
@@ -131,7 +131,7 @@ def check_annotation_cols(
     dataframe: pd.DataFrame,
 ) -> list[ValidationError]:
     """Checks value length and validity for columns matching annotation patterns."""
-    checks = []
+    checks: list[list[ValidationError]] = []
     for col in dataframe.columns:
         if col.endswith(ANNOTATION_LABEL_SUFFIX):
             checks.append(
@@ -140,7 +140,8 @@ def check_annotation_cols(
                     col_name=col,
                     min_len=ANNOTATION_LABEL_MIN_STR_LENGTH,
                     max_len=ANNOTATION_LABEL_MAX_STR_LENGTH,
-
+                    # Individual columns are not required
+                    is_required=False,
                 )
             )
         elif col.endswith(ANNOTATION_SCORE_SUFFIX):
@@ -231,15 +232,11 @@ def check_annotation_notes_column(
     col_name = ANNOTATION_NOTES_COLUMN_NAME
     if col_name in dataframe.columns:
         # Validate the length of the raw string
-        return
-
-
-
-
-
-            max_len=ANNOTATION_NOTES_MAX_STR_LENGTH,
-            is_required=False,
-            )
-        )
+        return common_value_validation.check_string_column_value_length(
+            df=dataframe,
+            col_name=col_name,
+            min_len=0,  # Allow empty notes
+            max_len=ANNOTATION_NOTES_MAX_STR_LENGTH,
+            is_required=False,
         )
     return []
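Several hunks above replace bare `errors = []` initializations with annotated ones. A minimal sketch of the motivation, using stand-in error classes: the annotation pins the list's element type so a type checker can flag appends of the wrong type (elsewhere in this release, plain string appends were replaced with error objects for the same reason).

class ValidationError(Exception):
    """Stand-in for arize.exceptions.base.ValidationError."""

class InvalidMissingValueInColumn(ValidationError):
    pass

def collect_errors() -> list[ValidationError]:
    errors: list[ValidationError] = []  # element type is now explicit
    errors.append(InvalidMissingValueInColumn("column contains null values"))
    # errors.append("a plain string") would now be flagged by the checker
    return errors

print(collect_errors())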
arize/spans/validation/common/argument_validation.py
CHANGED

@@ -39,13 +39,13 @@ def check_field_convertible_to_str(
 def check_dataframe_type(
     dataframe: object,
 ) -> list[InvalidTypeArgument]:
-    """Validates that the provided argument is a pandas DataFrame.
+    """Validates that the provided argument is a :class:`pandas.DataFrame`.
 
     Args:
-        dataframe: The object to validate as a pandas DataFrame.
+        dataframe: The object to validate as a :class:`pandas.DataFrame`.
 
     Returns:
-        List of validation errors if not a DataFrame (empty if valid).
+        List of validation errors if not a :class:`pandas.DataFrame` (empty if valid).
     """
     if not isinstance(dataframe, pd.DataFrame):
         return [
arize/spans/validation/common/dataframe_form_validation.py
CHANGED

@@ -17,10 +17,10 @@ if TYPE_CHECKING:
 def check_dataframe_index(
     dataframe: pd.DataFrame,
 ) -> list[InvalidDataFrameIndex]:
-    """Validates that the DataFrame has a default integer index.
+    """Validates that the :class:`pandas.DataFrame` has a default integer index.
 
     Args:
-        dataframe: The DataFrame to validate.
+        dataframe: The :class:`pandas.DataFrame` to validate.
 
     Returns:
         List of validation errors if index is not default (empty if valid).
@@ -34,10 +34,10 @@ def check_dataframe_required_column_set(
     df: pd.DataFrame,
     required_columns: list[str],
 ) -> list[InvalidDataFrameMissingColumns]:
-    """Validates that the DataFrame contains all required columns.
+    """Validates that the :class:`pandas.DataFrame` contains all required columns.
 
     Args:
-        df: The DataFrame to validate.
+        df: The :class:`pandas.DataFrame` to validate.
         required_columns: List of column names that must be present.
 
     Returns:
@@ -56,10 +56,10 @@ def check_dataframe_required_column_set(
 def check_dataframe_for_duplicate_columns(
     df: pd.DataFrame,
 ) -> list[InvalidDataFrameDuplicateColumns]:
-    """Validates that the DataFrame has no duplicate column names.
+    """Validates that the :class:`pandas.DataFrame` has no duplicate column names.
 
     Args:
-        df: The DataFrame to validate.
+        df: The :class:`pandas.DataFrame` to validate.
 
     Returns:
         List of validation errors if duplicate columns exist (empty if valid).
@@ -67,5 +67,5 @@ def check_dataframe_for_duplicate_columns(
     # Get the duplicated column names from the dataframe
     duplicate_columns = df.columns[df.columns.duplicated()]
     if not duplicate_columns.empty:
-        return [InvalidDataFrameDuplicateColumns(duplicate_columns)]
+        return [InvalidDataFrameDuplicateColumns(duplicate_columns.tolist())]
     return []
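A sketch of the final change above: selecting duplicated labels from df.columns yields a pandas Index, and .tolist() converts it to a plain list before the error object is constructed.

import pandas as pd

df = pd.DataFrame([[1, 2, 3]], columns=["a", "b", "a"])
duplicate_columns = df.columns[df.columns.duplicated()]
print(type(duplicate_columns))     # <class 'pandas.core.indexes.base.Index'>
print(duplicate_columns.tolist())  # ['a']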