arize 8.0.0a22__py3-none-any.whl → 8.0.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +28 -19
- arize/_exporter/client.py +56 -37
- arize/_exporter/parsers/tracing_data_parser.py +41 -30
- arize/_exporter/validation.py +3 -3
- arize/_flight/client.py +207 -76
- arize/_generated/api_client/__init__.py +30 -6
- arize/_generated/api_client/api/__init__.py +1 -0
- arize/_generated/api_client/api/datasets_api.py +864 -190
- arize/_generated/api_client/api/experiments_api.py +167 -131
- arize/_generated/api_client/api/projects_api.py +1197 -0
- arize/_generated/api_client/api_client.py +2 -2
- arize/_generated/api_client/configuration.py +42 -34
- arize/_generated/api_client/exceptions.py +2 -2
- arize/_generated/api_client/models/__init__.py +15 -4
- arize/_generated/api_client/models/dataset.py +10 -10
- arize/_generated/api_client/models/dataset_example.py +111 -0
- arize/_generated/api_client/models/dataset_example_update.py +100 -0
- arize/_generated/api_client/models/dataset_version.py +13 -13
- arize/_generated/api_client/models/datasets_create_request.py +16 -8
- arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
- arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
- arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
- arize/_generated/api_client/models/datasets_list200_response.py +10 -4
- arize/_generated/api_client/models/experiment.py +14 -16
- arize/_generated/api_client/models/experiment_run.py +108 -0
- arize/_generated/api_client/models/experiment_run_create.py +102 -0
- arize/_generated/api_client/models/experiments_create_request.py +16 -10
- arize/_generated/api_client/models/experiments_list200_response.py +10 -4
- arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
- arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
- arize/_generated/api_client/models/primitive_value.py +172 -0
- arize/_generated/api_client/models/problem.py +100 -0
- arize/_generated/api_client/models/project.py +99 -0
- arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
- arize/_generated/api_client/models/projects_list200_response.py +106 -0
- arize/_generated/api_client/rest.py +2 -2
- arize/_generated/api_client/test/test_dataset.py +4 -2
- arize/_generated/api_client/test/test_dataset_example.py +56 -0
- arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
- arize/_generated/api_client/test/test_dataset_version.py +7 -2
- arize/_generated/api_client/test/test_datasets_api.py +27 -13
- arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
- arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
- arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
- arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
- arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
- arize/_generated/api_client/test/test_experiment.py +2 -4
- arize/_generated/api_client/test/test_experiment_run.py +56 -0
- arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
- arize/_generated/api_client/test/test_experiments_api.py +6 -6
- arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
- arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
- arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
- arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
- arize/_generated/api_client/test/test_problem.py +57 -0
- arize/_generated/api_client/test/test_project.py +58 -0
- arize/_generated/api_client/test/test_projects_api.py +59 -0
- arize/_generated/api_client/test/test_projects_create_request.py +54 -0
- arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
- arize/_generated/api_client_README.md +43 -29
- arize/_generated/protocol/flight/flight_pb2.py +400 -0
- arize/_lazy.py +27 -19
- arize/client.py +181 -58
- arize/config.py +324 -116
- arize/constants/__init__.py +1 -0
- arize/constants/config.py +11 -4
- arize/constants/ml.py +6 -4
- arize/constants/openinference.py +2 -0
- arize/constants/pyarrow.py +2 -0
- arize/constants/spans.py +3 -1
- arize/datasets/__init__.py +1 -0
- arize/datasets/client.py +304 -84
- arize/datasets/errors.py +32 -2
- arize/datasets/validation.py +18 -8
- arize/embeddings/__init__.py +2 -0
- arize/embeddings/auto_generator.py +23 -19
- arize/embeddings/base_generators.py +89 -36
- arize/embeddings/constants.py +2 -0
- arize/embeddings/cv_generators.py +26 -4
- arize/embeddings/errors.py +27 -5
- arize/embeddings/nlp_generators.py +43 -18
- arize/embeddings/tabular_generators.py +46 -31
- arize/embeddings/usecases.py +12 -2
- arize/exceptions/__init__.py +1 -0
- arize/exceptions/auth.py +11 -1
- arize/exceptions/base.py +29 -4
- arize/exceptions/models.py +21 -2
- arize/exceptions/parameters.py +31 -0
- arize/exceptions/spaces.py +12 -1
- arize/exceptions/types.py +86 -7
- arize/exceptions/values.py +220 -20
- arize/experiments/__init__.py +13 -0
- arize/experiments/client.py +394 -285
- arize/experiments/evaluators/__init__.py +1 -0
- arize/experiments/evaluators/base.py +74 -41
- arize/experiments/evaluators/exceptions.py +6 -3
- arize/experiments/evaluators/executors.py +121 -73
- arize/experiments/evaluators/rate_limiters.py +106 -57
- arize/experiments/evaluators/types.py +34 -7
- arize/experiments/evaluators/utils.py +65 -27
- arize/experiments/functions.py +103 -101
- arize/experiments/tracing.py +52 -44
- arize/experiments/types.py +56 -31
- arize/logging.py +54 -22
- arize/ml/__init__.py +1 -0
- arize/ml/batch_validation/__init__.py +1 -0
- arize/{models → ml}/batch_validation/errors.py +545 -67
- arize/{models → ml}/batch_validation/validator.py +344 -303
- arize/ml/bounded_executor.py +47 -0
- arize/{models → ml}/casting.py +118 -108
- arize/{models → ml}/client.py +339 -118
- arize/{models → ml}/proto.py +97 -42
- arize/{models → ml}/stream_validation.py +43 -15
- arize/ml/surrogate_explainer/__init__.py +1 -0
- arize/{models → ml}/surrogate_explainer/mimic.py +25 -10
- arize/{types.py → ml/types.py} +355 -354
- arize/pre_releases.py +44 -0
- arize/projects/__init__.py +1 -0
- arize/projects/client.py +134 -0
- arize/regions.py +40 -0
- arize/spans/__init__.py +1 -0
- arize/spans/client.py +204 -175
- arize/spans/columns.py +13 -0
- arize/spans/conversion.py +60 -37
- arize/spans/validation/__init__.py +1 -0
- arize/spans/validation/annotations/__init__.py +1 -0
- arize/spans/validation/annotations/annotations_validation.py +6 -4
- arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
- arize/spans/validation/annotations/value_validation.py +35 -11
- arize/spans/validation/common/__init__.py +1 -0
- arize/spans/validation/common/argument_validation.py +33 -8
- arize/spans/validation/common/dataframe_form_validation.py +35 -9
- arize/spans/validation/common/errors.py +211 -11
- arize/spans/validation/common/value_validation.py +81 -14
- arize/spans/validation/evals/__init__.py +1 -0
- arize/spans/validation/evals/dataframe_form_validation.py +28 -8
- arize/spans/validation/evals/evals_validation.py +34 -4
- arize/spans/validation/evals/value_validation.py +26 -3
- arize/spans/validation/metadata/__init__.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +14 -5
- arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
- arize/spans/validation/metadata/value_validation.py +24 -10
- arize/spans/validation/spans/__init__.py +1 -0
- arize/spans/validation/spans/dataframe_form_validation.py +35 -14
- arize/spans/validation/spans/spans_validation.py +35 -4
- arize/spans/validation/spans/value_validation.py +78 -8
- arize/utils/__init__.py +1 -0
- arize/utils/arrow.py +31 -15
- arize/utils/cache.py +34 -6
- arize/utils/dataframe.py +20 -3
- arize/utils/online_tasks/__init__.py +2 -0
- arize/utils/online_tasks/dataframe_preprocessor.py +58 -47
- arize/utils/openinference_conversion.py +44 -5
- arize/utils/proto.py +10 -0
- arize/utils/size.py +5 -3
- arize/utils/types.py +105 -0
- arize/version.py +3 -1
- {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/METADATA +13 -6
- arize-8.0.0b0.dist-info/RECORD +175 -0
- {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/WHEEL +1 -1
- arize-8.0.0b0.dist-info/licenses/LICENSE +176 -0
- arize-8.0.0b0.dist-info/licenses/NOTICE +13 -0
- arize/_generated/protocol/flight/export_pb2.py +0 -61
- arize/_generated/protocol/flight/ingest_pb2.py +0 -365
- arize/models/__init__.py +0 -0
- arize/models/batch_validation/__init__.py +0 -0
- arize/models/bounded_executor.py +0 -34
- arize/models/surrogate_explainer/__init__.py +0 -0
- arize-8.0.0a22.dist-info/RECORD +0 -146
- arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
arize/spans/columns.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Span column definitions and OpenInference semantic conventions."""
|
|
2
|
+
|
|
1
3
|
from enum import Enum
|
|
2
4
|
|
|
3
5
|
import openinference.semconv.trace as oinf
|
|
@@ -5,6 +7,8 @@ import opentelemetry.semconv.trace as otel
|
|
|
5
7
|
|
|
6
8
|
|
|
7
9
|
class SpanColumnDataType(Enum):
|
|
10
|
+
"""Enum representing supported data types for span columns."""
|
|
11
|
+
|
|
8
12
|
BOOL = 1
|
|
9
13
|
NUMERIC = 2
|
|
10
14
|
STRING = 3
|
|
@@ -15,12 +19,21 @@ class SpanColumnDataType(Enum):
|
|
|
15
19
|
|
|
16
20
|
|
|
17
21
|
class SpanColumn:
|
|
22
|
+
"""Configuration for a custom span column with name, data type, and annotation settings."""
|
|
23
|
+
|
|
18
24
|
def __init__(
|
|
19
25
|
self,
|
|
20
26
|
name: str,
|
|
21
27
|
data_type: SpanColumnDataType,
|
|
22
28
|
required: bool = False,
|
|
23
29
|
) -> None:
|
|
30
|
+
"""Initialize a span column configuration.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
name: Name of the span column.
|
|
34
|
+
data_type: Data type of the column values.
|
|
35
|
+
required: Whether the column is required.
|
|
36
|
+
"""
|
|
24
37
|
self.name = name
|
|
25
38
|
self.required = required
|
|
26
39
|
self.data_type = data_type
|
arize/spans/conversion.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
"""Span data conversion utilities for transforming and normalizing span data."""
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
|
-
from
|
|
3
|
-
from
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from datetime import datetime, timezone
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
import pandas as pd
|
|
@@ -10,53 +12,69 @@ from arize.spans.columns import SPAN_OPENINFERENCE_COLUMNS, SpanColumnDataType
|
|
|
10
12
|
|
|
11
13
|
|
|
12
14
|
def convert_timestamps(df: pd.DataFrame, fmt: str = "") -> pd.DataFrame:
|
|
15
|
+
"""Convert timestamp columns in a DataFrame to nanoseconds.
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
df: The pandas DataFrame containing timestamp columns.
|
|
19
|
+
fmt: Optional datetime format string for parsing string timestamps. Defaults to "".
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
The DataFrame with timestamp columns converted to nanoseconds.
|
|
23
|
+
|
|
24
|
+
Raises:
|
|
25
|
+
KeyError: If required timestamp column is not found in DataFrame.
|
|
26
|
+
"""
|
|
13
27
|
for col in SPAN_OPENINFERENCE_COLUMNS:
|
|
14
28
|
if col.data_type != SpanColumnDataType.TIMESTAMP:
|
|
15
29
|
continue
|
|
30
|
+
if col.name not in df.columns:
|
|
31
|
+
raise KeyError(f"Column '{col.name}' not found in DataFrame")
|
|
16
32
|
df[col.name] = df[col.name].apply(lambda dt: _datetime_to_ns(dt, fmt))
|
|
17
33
|
return df
|
|
18
34
|
|
|
19
35
|
|
|
20
36
|
def _datetime_to_ns(dt: object, fmt: str) -> int:
|
|
21
37
|
if isinstance(dt, str):
|
|
38
|
+
# Try ISO 8601 with timezone first
|
|
22
39
|
try:
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
#
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# logger.error(
|
|
36
|
-
# f"Error converting datetime object to nanoseconds: {e}"
|
|
37
|
-
# )
|
|
38
|
-
raise e
|
|
39
|
-
return ts
|
|
40
|
-
elif isinstance(dt, pd.Timestamp):
|
|
40
|
+
parsed = datetime.fromisoformat(dt)
|
|
41
|
+
if parsed.tzinfo is None:
|
|
42
|
+
# If no timezone, assume UTC
|
|
43
|
+
parsed = parsed.replace(tzinfo=timezone.utc)
|
|
44
|
+
except ValueError:
|
|
45
|
+
# Fall back to custom format
|
|
46
|
+
parsed = datetime.strptime(dt, fmt).replace(tzinfo=timezone.utc)
|
|
47
|
+
|
|
48
|
+
return int(parsed.timestamp() * 1e9)
|
|
49
|
+
if isinstance(dt, datetime):
|
|
50
|
+
return int(datetime.timestamp(dt) * 1e9)
|
|
51
|
+
if isinstance(dt, pd.Timestamp):
|
|
41
52
|
return int(dt.value)
|
|
42
|
-
|
|
53
|
+
if isinstance(dt, pd.DatetimeIndex):
|
|
43
54
|
# Only allow a single element; otherwise ambiguous for a scalar function
|
|
44
55
|
if len(dt) != 1:
|
|
45
56
|
raise TypeError(
|
|
46
57
|
f"Expected a single timestamp in DatetimeIndex, got length={len(dt)}"
|
|
47
58
|
)
|
|
48
59
|
return int(dt.to_numpy(dtype="datetime64[ns]").astype("int64")[0])
|
|
49
|
-
|
|
60
|
+
if isinstance(dt, (int, float)):
|
|
50
61
|
# Assume value already in nanoseconds,
|
|
51
62
|
# validate timestamps in validate_values
|
|
52
63
|
return int(dt)
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
raise e
|
|
64
|
+
e = TypeError(f"Cannot convert type {type(dt)} to nanoseconds")
|
|
65
|
+
# logger.error(f"Error converting pandas Timestamp to nanoseconds: {e}")
|
|
66
|
+
raise e
|
|
57
67
|
|
|
58
68
|
|
|
59
69
|
def jsonify_dictionaries(df: pd.DataFrame) -> pd.DataFrame:
|
|
70
|
+
"""Convert dictionary and list-of-dictionary columns to JSON strings.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
df: The pandas DataFrame containing dictionary columns.
|
|
74
|
+
|
|
75
|
+
Returns:
|
|
76
|
+
The DataFrame with dictionary columns converted to JSON strings.
|
|
77
|
+
"""
|
|
60
78
|
# NOTE: numpy arrays are not json serializable. Hence, we assume the
|
|
61
79
|
# embeddings come as lists, not arrays
|
|
62
80
|
dict_cols = [
|
|
@@ -90,7 +108,15 @@ def jsonify_dictionaries(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
90
108
|
|
|
91
109
|
|
|
92
110
|
# Defines what is considered a missing value
|
|
93
|
-
def
|
|
111
|
+
def is_missing_value(value: object) -> bool:
|
|
112
|
+
"""Check if a value should be considered missing or invalid.
|
|
113
|
+
|
|
114
|
+
Args:
|
|
115
|
+
value: The value to check.
|
|
116
|
+
|
|
117
|
+
Returns:
|
|
118
|
+
True if the value is missing (NaN, infinity, or pandas NA), False otherwise.
|
|
119
|
+
"""
|
|
94
120
|
assumed_missing_values = (
|
|
95
121
|
np.inf,
|
|
96
122
|
-np.inf,
|
|
@@ -99,22 +125,19 @@ def isMissingValue(value: Any) -> bool:
|
|
|
99
125
|
|
|
100
126
|
|
|
101
127
|
def _jsonify_list_of_dicts(
|
|
102
|
-
list_of_dicts: Iterable[
|
|
103
|
-
) ->
|
|
104
|
-
if not isinstance(list_of_dicts, Iterable) and
|
|
128
|
+
list_of_dicts: Iterable[dict[str, object]] | None,
|
|
129
|
+
) -> list[str]:
|
|
130
|
+
if not isinstance(list_of_dicts, Iterable) and is_missing_value(
|
|
105
131
|
list_of_dicts
|
|
106
132
|
):
|
|
107
133
|
return []
|
|
108
|
-
|
|
109
|
-
for d in list_of_dicts:
|
|
110
|
-
list_of_json.append(_jsonify_dict(d))
|
|
111
|
-
return list_of_json
|
|
134
|
+
return [_jsonify_dict(d) for d in list_of_dicts]
|
|
112
135
|
|
|
113
136
|
|
|
114
|
-
def _jsonify_dict(d:
|
|
137
|
+
def _jsonify_dict(d: dict[str, object] | None) -> str | None:
|
|
115
138
|
if d is None:
|
|
116
|
-
return
|
|
117
|
-
if
|
|
139
|
+
return None
|
|
140
|
+
if is_missing_value(d):
|
|
118
141
|
return None
|
|
119
142
|
d = d.copy() # avoid side effects
|
|
120
143
|
for k, v in d.items():
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Validation utilities for LLM tracing spans data."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Annotation validation for LLM tracing spans."""
|
|
@@ -1,7 +1,9 @@
|
|
|
1
|
+
"""Annotation validation orchestration for spans."""
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
5
|
from itertools import chain
|
|
4
|
-
from typing import TYPE_CHECKING
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
5
7
|
|
|
6
8
|
from arize.spans.columns import SPAN_SPAN_ID_COL
|
|
7
9
|
from arize.spans.validation.annotations import (
|
|
@@ -27,7 +29,7 @@ if TYPE_CHECKING:
|
|
|
27
29
|
def validate_argument_types(
|
|
28
30
|
annotations_dataframe: pd.DataFrame,
|
|
29
31
|
project_name: str,
|
|
30
|
-
) ->
|
|
32
|
+
) -> list[ValidationError]:
|
|
31
33
|
"""Validates argument types for log_annotations."""
|
|
32
34
|
checks = chain(
|
|
33
35
|
common_arg_validation.check_field_convertible_to_str(project_name),
|
|
@@ -40,7 +42,7 @@ def validate_argument_types(
|
|
|
40
42
|
|
|
41
43
|
def validate_dataframe_form(
|
|
42
44
|
annotations_dataframe: pd.DataFrame,
|
|
43
|
-
) ->
|
|
45
|
+
) -> list[ValidationError]:
|
|
44
46
|
"""Validates the form/structure of the annotation dataframe."""
|
|
45
47
|
# Call annotation-specific function (to be created)
|
|
46
48
|
df_validation.log_info_dataframe_extra_column_names(annotations_dataframe)
|
|
@@ -64,7 +66,7 @@ def validate_dataframe_form(
|
|
|
64
66
|
def validate_values(
|
|
65
67
|
annotations_dataframe: pd.DataFrame,
|
|
66
68
|
project_name: str,
|
|
67
|
-
) ->
|
|
69
|
+
) -> list[ValidationError]:
|
|
68
70
|
"""Validates the values within the annotation dataframe."""
|
|
69
71
|
checks = chain(
|
|
70
72
|
# Common checks remain the same
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
+
"""DataFrame form validation for span annotations."""
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
5
|
import logging
|
|
4
6
|
import re
|
|
5
|
-
from typing import TYPE_CHECKING
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
6
8
|
|
|
7
9
|
import pandas as pd
|
|
8
10
|
|
|
@@ -19,7 +21,7 @@ from arize.spans.columns import (
|
|
|
19
21
|
ANNOTATION_UPDATED_BY_SUFFIX,
|
|
20
22
|
SPAN_SPAN_ID_COL,
|
|
21
23
|
)
|
|
22
|
-
from arize.spans.conversion import
|
|
24
|
+
from arize.spans.conversion import is_missing_value
|
|
23
25
|
from arize.spans.validation.common.errors import (
|
|
24
26
|
InvalidAnnotationColumnFormat,
|
|
25
27
|
InvalidDataFrameColumnContentTypes,
|
|
@@ -36,7 +38,7 @@ def log_info_dataframe_extra_column_names(
|
|
|
36
38
|
) -> None:
|
|
37
39
|
"""Logs columns that don't match expected annotation or context patterns."""
|
|
38
40
|
if df is None:
|
|
39
|
-
return
|
|
41
|
+
return
|
|
40
42
|
# Check against annotation pattern, span id, and note column
|
|
41
43
|
irrelevant_columns = [
|
|
42
44
|
col
|
|
@@ -56,12 +58,12 @@ def log_info_dataframe_extra_column_names(
|
|
|
56
58
|
"- annotation.<your-annotation-name>.score"
|
|
57
59
|
f"An optional '{ANNOTATION_NOTES_COLUMN_NAME}' column can also be included."
|
|
58
60
|
)
|
|
59
|
-
return
|
|
61
|
+
return
|
|
60
62
|
|
|
61
63
|
|
|
62
64
|
def check_invalid_annotation_column_names(
|
|
63
65
|
df: pd.DataFrame,
|
|
64
|
-
) ->
|
|
66
|
+
) -> list[ValidationError]:
|
|
65
67
|
"""Checks for columns that start with 'annotation.' but don't match the expected pattern."""
|
|
66
68
|
errors = []
|
|
67
69
|
|
|
@@ -86,7 +88,7 @@ def check_invalid_annotation_column_names(
|
|
|
86
88
|
|
|
87
89
|
def check_dataframe_column_content_type(
|
|
88
90
|
df: pd.DataFrame,
|
|
89
|
-
) ->
|
|
91
|
+
) -> list[ValidationError]:
|
|
90
92
|
"""Checks that columns matching annotation patterns have the correct data types."""
|
|
91
93
|
wrong_labels_cols = []
|
|
92
94
|
wrong_scores_cols = []
|
|
@@ -128,14 +130,14 @@ def check_dataframe_column_content_type(
|
|
|
128
130
|
# Check annotation label column type (string or missing)
|
|
129
131
|
elif annotation_label_re.match(column):
|
|
130
132
|
if not all(
|
|
131
|
-
isinstance(value, str) or
|
|
133
|
+
isinstance(value, str) or is_missing_value(value)
|
|
132
134
|
for value in df[column]
|
|
133
135
|
):
|
|
134
136
|
wrong_labels_cols.append(column)
|
|
135
137
|
# Check annotation score column type (numeric or missing)
|
|
136
138
|
elif annotation_score_re.match(column):
|
|
137
139
|
if not all(
|
|
138
|
-
isinstance(value, (int, float)) or
|
|
140
|
+
isinstance(value, (int, float)) or is_missing_value(value)
|
|
139
141
|
for value in df[column]
|
|
140
142
|
):
|
|
141
143
|
wrong_scores_cols.append(column)
|
|
@@ -144,21 +146,21 @@ def check_dataframe_column_content_type(
|
|
|
144
146
|
if not all(
|
|
145
147
|
# Note: After formatting, this column holds list<string> (JSON), not just string.
|
|
146
148
|
# We rely on later schema inference/validation. Keep basic check for now.
|
|
147
|
-
isinstance(value, list) or
|
|
149
|
+
isinstance(value, list) or is_missing_value(value)
|
|
148
150
|
for value in df[column]
|
|
149
151
|
):
|
|
150
152
|
wrong_notes_cols.append(column)
|
|
151
153
|
# Check annotation updated_by column type (string or missing)
|
|
152
154
|
elif annotation_updated_by_re.match(column):
|
|
153
155
|
if not all(
|
|
154
|
-
isinstance(value, str) or
|
|
156
|
+
isinstance(value, str) or is_missing_value(value)
|
|
155
157
|
for value in df[column]
|
|
156
158
|
):
|
|
157
159
|
wrong_updated_by_cols.append(column)
|
|
158
160
|
# Check annotation updated_at column type (numeric or missing)
|
|
159
161
|
elif annotation_updated_at_re.match(column) and not all(
|
|
160
162
|
# Allow int, float (e.g., Unix timestamp millis)
|
|
161
|
-
isinstance(value, (int, float)) or
|
|
163
|
+
isinstance(value, (int, float)) or is_missing_value(value)
|
|
162
164
|
for value in df[column]
|
|
163
165
|
):
|
|
164
166
|
wrong_updated_at_cols.append(column)
|
|
@@ -1,10 +1,12 @@
|
|
|
1
|
+
"""Value validation logic for span annotation data."""
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
5
|
import logging
|
|
4
6
|
import re
|
|
5
|
-
from datetime import datetime
|
|
7
|
+
from datetime import datetime, timezone
|
|
6
8
|
from itertools import chain
|
|
7
|
-
from typing import TYPE_CHECKING
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
8
10
|
|
|
9
11
|
from arize.constants.spans import (
|
|
10
12
|
ANNOTATION_LABEL_MAX_STR_LENGTH,
|
|
@@ -41,33 +43,55 @@ logger = logging.getLogger(__name__)
|
|
|
41
43
|
|
|
42
44
|
|
|
43
45
|
class InvalidAnnotationTimestamp(ValidationError):
|
|
46
|
+
"""Raised when annotation timestamp is invalid or out of acceptable range."""
|
|
47
|
+
|
|
44
48
|
def __repr__(self) -> str:
|
|
49
|
+
"""Return a string representation for debugging and logging."""
|
|
45
50
|
return "Invalid_Annotation_Timestamp"
|
|
46
51
|
|
|
47
52
|
def __init__(self, timestamp_col_name: str, error_type: str) -> None:
|
|
53
|
+
"""Initialize the exception with timestamp validation context.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
timestamp_col_name: Name of the annotation timestamp column.
|
|
57
|
+
error_type: Type of timestamp error (e.g., 'future').
|
|
58
|
+
"""
|
|
48
59
|
self.timestamp_col_name = timestamp_col_name
|
|
49
60
|
self.error_type = error_type
|
|
50
61
|
|
|
51
62
|
def error_message(self) -> str:
|
|
63
|
+
"""Return the error message for this exception."""
|
|
52
64
|
if self.error_type == "future":
|
|
53
65
|
return (
|
|
54
66
|
f"At least one timestamp in the annotation column '{self.timestamp_col_name}' "
|
|
55
67
|
f"is in the future. Annotation timestamps cannot be in the future."
|
|
56
68
|
)
|
|
57
|
-
|
|
69
|
+
if self.error_type == "non_positive":
|
|
58
70
|
return (
|
|
59
71
|
f"At least one timestamp in the annotation column '{self.timestamp_col_name}' "
|
|
60
72
|
f"is zero or negative. Annotation timestamps must be positive values."
|
|
61
73
|
)
|
|
62
|
-
|
|
63
|
-
return f"Invalid timestamp in annotation column '{self.timestamp_col_name}'."
|
|
74
|
+
return f"Invalid timestamp in annotation column '{self.timestamp_col_name}'."
|
|
64
75
|
|
|
65
76
|
|
|
66
77
|
def check_annotation_updated_at_timestamp(
|
|
67
78
|
df: pd.DataFrame,
|
|
68
79
|
col_name: str,
|
|
69
80
|
is_required: bool,
|
|
70
|
-
) ->
|
|
81
|
+
) -> list[InvalidMissingValueInColumn | InvalidAnnotationTimestamp]:
|
|
82
|
+
"""Validates annotation timestamp values for validity and acceptable ranges.
|
|
83
|
+
|
|
84
|
+
Checks that timestamp values are positive, not in the future, and satisfy
|
|
85
|
+
required constraints if specified.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
df: DataFrame containing the annotation timestamp column.
|
|
89
|
+
col_name: Name of the timestamp column to validate.
|
|
90
|
+
is_required: Whether the column must have non-null values in all rows.
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
List of validation errors found (empty if valid).
|
|
94
|
+
"""
|
|
71
95
|
# This check expects that timestamps have previously been converted to milliseconds
|
|
72
96
|
if col_name not in df.columns:
|
|
73
97
|
return []
|
|
@@ -83,7 +107,7 @@ def check_annotation_updated_at_timestamp(
|
|
|
83
107
|
if df[col_name].isnull().all():
|
|
84
108
|
return errors
|
|
85
109
|
|
|
86
|
-
now_ms = datetime.now().timestamp() * 1000
|
|
110
|
+
now_ms = datetime.now(tz=timezone.utc).timestamp() * 1000
|
|
87
111
|
|
|
88
112
|
if df[col_name].max() > now_ms:
|
|
89
113
|
logger.warning(f"Detected future timestamp in column '{col_name}'.")
|
|
@@ -105,7 +129,7 @@ def check_annotation_updated_at_timestamp(
|
|
|
105
129
|
|
|
106
130
|
def check_annotation_cols(
|
|
107
131
|
dataframe: pd.DataFrame,
|
|
108
|
-
) ->
|
|
132
|
+
) -> list[ValidationError]:
|
|
109
133
|
"""Checks value length and validity for columns matching annotation patterns."""
|
|
110
134
|
checks = []
|
|
111
135
|
for col in dataframe.columns:
|
|
@@ -150,7 +174,7 @@ def check_annotation_cols(
|
|
|
150
174
|
|
|
151
175
|
def check_annotation_columns_null_values(
|
|
152
176
|
dataframe: pd.DataFrame,
|
|
153
|
-
) ->
|
|
177
|
+
) -> list[ValidationError]:
|
|
154
178
|
"""Checks that for a given annotation name, at least one of label or score is non-null per row."""
|
|
155
179
|
invalid_annotation_names = []
|
|
156
180
|
annotation_names = set()
|
|
@@ -190,7 +214,7 @@ def check_annotation_columns_null_values(
|
|
|
190
214
|
invalid_annotation_names.append(ann_name)
|
|
191
215
|
|
|
192
216
|
# Use set to report each name only once
|
|
193
|
-
unique_invalid_names = sorted(
|
|
217
|
+
unique_invalid_names = sorted(set(invalid_annotation_names))
|
|
194
218
|
if unique_invalid_names:
|
|
195
219
|
return [
|
|
196
220
|
InvalidNullAnnotationLabelAndScore(
|
|
@@ -202,7 +226,7 @@ def check_annotation_columns_null_values(
|
|
|
202
226
|
|
|
203
227
|
def check_annotation_notes_column(
|
|
204
228
|
dataframe: pd.DataFrame,
|
|
205
|
-
) ->
|
|
229
|
+
) -> list[ValidationError]:
|
|
206
230
|
"""Checks the value length for the optional annotation.notes column (raw string)."""
|
|
207
231
|
col_name = ANNOTATION_NOTES_COLUMN_NAME
|
|
208
232
|
if col_name in dataframe.columns:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Common validation utilities shared across spans validation."""
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
"""Common argument validation utilities for spans."""
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -7,9 +7,18 @@ from arize.spans.validation.common.errors import InvalidTypeArgument
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def check_field_convertible_to_str(
|
|
10
|
-
project_name:
|
|
11
|
-
model_version:
|
|
12
|
-
) ->
|
|
10
|
+
project_name: object,
|
|
11
|
+
model_version: object = None,
|
|
12
|
+
) -> list[InvalidFieldTypeConversion]:
|
|
13
|
+
"""Validates that field arguments can be converted to strings.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
project_name: The project name value to validate for string conversion.
|
|
17
|
+
model_version: Optional model version value to validate for string conversion.
|
|
18
|
+
|
|
19
|
+
Returns:
|
|
20
|
+
List of validation errors for fields that cannot be converted to strings.
|
|
21
|
+
"""
|
|
13
22
|
wrong_fields = []
|
|
14
23
|
if project_name is not None and not isinstance(project_name, str):
|
|
15
24
|
try:
|
|
@@ -28,8 +37,16 @@ def check_field_convertible_to_str(
|
|
|
28
37
|
|
|
29
38
|
|
|
30
39
|
def check_dataframe_type(
|
|
31
|
-
dataframe,
|
|
32
|
-
) ->
|
|
40
|
+
dataframe: object,
|
|
41
|
+
) -> list[InvalidTypeArgument]:
|
|
42
|
+
"""Validates that the provided argument is a pandas DataFrame.
|
|
43
|
+
|
|
44
|
+
Args:
|
|
45
|
+
dataframe: The object to validate as a pandas DataFrame.
|
|
46
|
+
|
|
47
|
+
Returns:
|
|
48
|
+
List of validation errors if not a DataFrame (empty if valid).
|
|
49
|
+
"""
|
|
33
50
|
if not isinstance(dataframe, pd.DataFrame):
|
|
34
51
|
return [
|
|
35
52
|
InvalidTypeArgument(
|
|
@@ -42,8 +59,16 @@ def check_dataframe_type(
|
|
|
42
59
|
|
|
43
60
|
|
|
44
61
|
def check_datetime_format_type(
|
|
45
|
-
dt_fmt:
|
|
46
|
-
) ->
|
|
62
|
+
dt_fmt: object,
|
|
63
|
+
) -> list[InvalidTypeArgument]:
|
|
64
|
+
"""Validates that the datetime format argument is a string.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
dt_fmt: The datetime format value to validate.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
List of validation errors if not a string (empty if valid).
|
|
71
|
+
"""
|
|
47
72
|
if not isinstance(dt_fmt, str):
|
|
48
73
|
return [
|
|
49
74
|
InvalidTypeArgument(
|
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
"""Common DataFrame form validation for spans."""
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
4
6
|
|
|
5
7
|
from arize.exceptions.base import InvalidDataFrameIndex
|
|
6
8
|
from arize.spans.validation.common.errors import (
|
|
@@ -14,7 +16,15 @@ if TYPE_CHECKING:
|
|
|
14
16
|
|
|
15
17
|
def check_dataframe_index(
|
|
16
18
|
dataframe: pd.DataFrame,
|
|
17
|
-
) ->
|
|
19
|
+
) -> list[InvalidDataFrameIndex]:
|
|
20
|
+
"""Validates that the DataFrame has a default integer index.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
dataframe: The DataFrame to validate.
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
List of validation errors if index is not default (empty if valid).
|
|
27
|
+
"""
|
|
18
28
|
if (dataframe.index != dataframe.reset_index(drop=True).index).any():
|
|
19
29
|
return [InvalidDataFrameIndex()]
|
|
20
30
|
return []
|
|
@@ -22,13 +32,21 @@ def check_dataframe_index(
|
|
|
22
32
|
|
|
23
33
|
def check_dataframe_required_column_set(
|
|
24
34
|
df: pd.DataFrame,
|
|
25
|
-
required_columns:
|
|
26
|
-
) ->
|
|
35
|
+
required_columns: list[str],
|
|
36
|
+
) -> list[InvalidDataFrameMissingColumns]:
|
|
37
|
+
"""Validates that the DataFrame contains all required columns.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
df: The DataFrame to validate.
|
|
41
|
+
required_columns: List of column names that must be present.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
List of validation errors for missing columns (empty if valid).
|
|
45
|
+
"""
|
|
27
46
|
existing_columns = set(df.columns)
|
|
28
|
-
missing_cols = [
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
missing_cols.append(col)
|
|
47
|
+
missing_cols = [
|
|
48
|
+
col for col in required_columns if col not in existing_columns
|
|
49
|
+
]
|
|
32
50
|
|
|
33
51
|
if missing_cols:
|
|
34
52
|
return [InvalidDataFrameMissingColumns(missing_cols=missing_cols)]
|
|
@@ -37,7 +55,15 @@ def check_dataframe_required_column_set(
|
|
|
37
55
|
|
|
38
56
|
def check_dataframe_for_duplicate_columns(
|
|
39
57
|
df: pd.DataFrame,
|
|
40
|
-
) ->
|
|
58
|
+
) -> list[InvalidDataFrameDuplicateColumns]:
|
|
59
|
+
"""Validates that the DataFrame has no duplicate column names.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
df: The DataFrame to validate.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
List of validation errors if duplicate columns exist (empty if valid).
|
|
66
|
+
"""
|
|
41
67
|
# Get the duplicated column names from the dataframe
|
|
42
68
|
duplicate_columns = df.columns[df.columns.duplicated()]
|
|
43
69
|
if not duplicate_columns.empty:
|