arize 8.0.0a22__py3-none-any.whl → 8.0.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +28 -19
- arize/_exporter/client.py +56 -37
- arize/_exporter/parsers/tracing_data_parser.py +41 -30
- arize/_exporter/validation.py +3 -3
- arize/_flight/client.py +207 -76
- arize/_generated/api_client/__init__.py +30 -6
- arize/_generated/api_client/api/__init__.py +1 -0
- arize/_generated/api_client/api/datasets_api.py +864 -190
- arize/_generated/api_client/api/experiments_api.py +167 -131
- arize/_generated/api_client/api/projects_api.py +1197 -0
- arize/_generated/api_client/api_client.py +2 -2
- arize/_generated/api_client/configuration.py +42 -34
- arize/_generated/api_client/exceptions.py +2 -2
- arize/_generated/api_client/models/__init__.py +15 -4
- arize/_generated/api_client/models/dataset.py +10 -10
- arize/_generated/api_client/models/dataset_example.py +111 -0
- arize/_generated/api_client/models/dataset_example_update.py +100 -0
- arize/_generated/api_client/models/dataset_version.py +13 -13
- arize/_generated/api_client/models/datasets_create_request.py +16 -8
- arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
- arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
- arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
- arize/_generated/api_client/models/datasets_list200_response.py +10 -4
- arize/_generated/api_client/models/experiment.py +14 -16
- arize/_generated/api_client/models/experiment_run.py +108 -0
- arize/_generated/api_client/models/experiment_run_create.py +102 -0
- arize/_generated/api_client/models/experiments_create_request.py +16 -10
- arize/_generated/api_client/models/experiments_list200_response.py +10 -4
- arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
- arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
- arize/_generated/api_client/models/primitive_value.py +172 -0
- arize/_generated/api_client/models/problem.py +100 -0
- arize/_generated/api_client/models/project.py +99 -0
- arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
- arize/_generated/api_client/models/projects_list200_response.py +106 -0
- arize/_generated/api_client/rest.py +2 -2
- arize/_generated/api_client/test/test_dataset.py +4 -2
- arize/_generated/api_client/test/test_dataset_example.py +56 -0
- arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
- arize/_generated/api_client/test/test_dataset_version.py +7 -2
- arize/_generated/api_client/test/test_datasets_api.py +27 -13
- arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
- arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
- arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
- arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
- arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
- arize/_generated/api_client/test/test_experiment.py +2 -4
- arize/_generated/api_client/test/test_experiment_run.py +56 -0
- arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
- arize/_generated/api_client/test/test_experiments_api.py +6 -6
- arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
- arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
- arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
- arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
- arize/_generated/api_client/test/test_problem.py +57 -0
- arize/_generated/api_client/test/test_project.py +58 -0
- arize/_generated/api_client/test/test_projects_api.py +59 -0
- arize/_generated/api_client/test/test_projects_create_request.py +54 -0
- arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
- arize/_generated/api_client_README.md +43 -29
- arize/_generated/protocol/flight/flight_pb2.py +400 -0
- arize/_lazy.py +27 -19
- arize/client.py +181 -58
- arize/config.py +324 -116
- arize/constants/__init__.py +1 -0
- arize/constants/config.py +11 -4
- arize/constants/ml.py +6 -4
- arize/constants/openinference.py +2 -0
- arize/constants/pyarrow.py +2 -0
- arize/constants/spans.py +3 -1
- arize/datasets/__init__.py +1 -0
- arize/datasets/client.py +304 -84
- arize/datasets/errors.py +32 -2
- arize/datasets/validation.py +18 -8
- arize/embeddings/__init__.py +2 -0
- arize/embeddings/auto_generator.py +23 -19
- arize/embeddings/base_generators.py +89 -36
- arize/embeddings/constants.py +2 -0
- arize/embeddings/cv_generators.py +26 -4
- arize/embeddings/errors.py +27 -5
- arize/embeddings/nlp_generators.py +43 -18
- arize/embeddings/tabular_generators.py +46 -31
- arize/embeddings/usecases.py +12 -2
- arize/exceptions/__init__.py +1 -0
- arize/exceptions/auth.py +11 -1
- arize/exceptions/base.py +29 -4
- arize/exceptions/models.py +21 -2
- arize/exceptions/parameters.py +31 -0
- arize/exceptions/spaces.py +12 -1
- arize/exceptions/types.py +86 -7
- arize/exceptions/values.py +220 -20
- arize/experiments/__init__.py +13 -0
- arize/experiments/client.py +394 -285
- arize/experiments/evaluators/__init__.py +1 -0
- arize/experiments/evaluators/base.py +74 -41
- arize/experiments/evaluators/exceptions.py +6 -3
- arize/experiments/evaluators/executors.py +121 -73
- arize/experiments/evaluators/rate_limiters.py +106 -57
- arize/experiments/evaluators/types.py +34 -7
- arize/experiments/evaluators/utils.py +65 -27
- arize/experiments/functions.py +103 -101
- arize/experiments/tracing.py +52 -44
- arize/experiments/types.py +56 -31
- arize/logging.py +54 -22
- arize/ml/__init__.py +1 -0
- arize/ml/batch_validation/__init__.py +1 -0
- arize/{models → ml}/batch_validation/errors.py +545 -67
- arize/{models → ml}/batch_validation/validator.py +344 -303
- arize/ml/bounded_executor.py +47 -0
- arize/{models → ml}/casting.py +118 -108
- arize/{models → ml}/client.py +339 -118
- arize/{models → ml}/proto.py +97 -42
- arize/{models → ml}/stream_validation.py +43 -15
- arize/ml/surrogate_explainer/__init__.py +1 -0
- arize/{models → ml}/surrogate_explainer/mimic.py +25 -10
- arize/{types.py → ml/types.py} +355 -354
- arize/pre_releases.py +44 -0
- arize/projects/__init__.py +1 -0
- arize/projects/client.py +134 -0
- arize/regions.py +40 -0
- arize/spans/__init__.py +1 -0
- arize/spans/client.py +204 -175
- arize/spans/columns.py +13 -0
- arize/spans/conversion.py +60 -37
- arize/spans/validation/__init__.py +1 -0
- arize/spans/validation/annotations/__init__.py +1 -0
- arize/spans/validation/annotations/annotations_validation.py +6 -4
- arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
- arize/spans/validation/annotations/value_validation.py +35 -11
- arize/spans/validation/common/__init__.py +1 -0
- arize/spans/validation/common/argument_validation.py +33 -8
- arize/spans/validation/common/dataframe_form_validation.py +35 -9
- arize/spans/validation/common/errors.py +211 -11
- arize/spans/validation/common/value_validation.py +81 -14
- arize/spans/validation/evals/__init__.py +1 -0
- arize/spans/validation/evals/dataframe_form_validation.py +28 -8
- arize/spans/validation/evals/evals_validation.py +34 -4
- arize/spans/validation/evals/value_validation.py +26 -3
- arize/spans/validation/metadata/__init__.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +14 -5
- arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
- arize/spans/validation/metadata/value_validation.py +24 -10
- arize/spans/validation/spans/__init__.py +1 -0
- arize/spans/validation/spans/dataframe_form_validation.py +35 -14
- arize/spans/validation/spans/spans_validation.py +35 -4
- arize/spans/validation/spans/value_validation.py +78 -8
- arize/utils/__init__.py +1 -0
- arize/utils/arrow.py +31 -15
- arize/utils/cache.py +34 -6
- arize/utils/dataframe.py +20 -3
- arize/utils/online_tasks/__init__.py +2 -0
- arize/utils/online_tasks/dataframe_preprocessor.py +58 -47
- arize/utils/openinference_conversion.py +44 -5
- arize/utils/proto.py +10 -0
- arize/utils/size.py +5 -3
- arize/utils/types.py +105 -0
- arize/version.py +3 -1
- {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/METADATA +13 -6
- arize-8.0.0b0.dist-info/RECORD +175 -0
- {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/WHEEL +1 -1
- arize-8.0.0b0.dist-info/licenses/LICENSE +176 -0
- arize-8.0.0b0.dist-info/licenses/NOTICE +13 -0
- arize/_generated/protocol/flight/export_pb2.py +0 -61
- arize/_generated/protocol/flight/ingest_pb2.py +0 -365
- arize/models/__init__.py +0 -0
- arize/models/batch_validation/__init__.py +0 -0
- arize/models/bounded_executor.py +0 -34
- arize/models/surrogate_explainer/__init__.py +0 -0
- arize-8.0.0a22.dist-info/RECORD +0 -146
- arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
@@ -1,10 +1,13 @@
+"""Value validation logic for span data."""
+
 from __future__ import annotations
 
 from itertools import chain
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 from arize.constants import spans as tracing_constants
 from arize.constants.ml import MAX_EMBEDDING_DIMENSIONALITY
+from arize.ml.types import StatusCodes
 from arize.spans import columns as tracing_cols
 from arize.spans.validation.common import value_validation
 from arize.spans.validation.common.errors import (
@@ -13,7 +16,7 @@ from arize.spans.validation.common.errors import (
     InvalidEventValueInColumn,
     InvalidLLMMessageValueInColumn,
 )
-from arize.types import
+from arize.utils.types import is_dict_of, is_json_str
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -23,7 +26,18 @@ if TYPE_CHECKING:
 
 def check_span_root_field_values(
     dataframe: pd.DataFrame,
-) ->
+) -> list[ValidationError]:
+    """Validates root-level span field values for proper format and constraints.
+
+    Checks span ID, trace ID, parent span ID, name, status code, status message,
+    timestamps, and events for conformance to specification limits.
+
+    Args:
+        dataframe: The DataFrame containing span data.
+
+    Returns:
+        List of validation errors found in root span fields.
+    """
     return list(
         chain(
             value_validation.check_string_column_value_length(
@@ -77,7 +91,18 @@ def check_span_root_field_values(
 
 def check_span_attributes_values(
     dataframe: pd.DataFrame,
-) ->
+) -> list[ValidationError]:
+    """Validates span attribute values for proper format and constraints.
+
+    Checks all span attributes including LLM parameters, embeddings, documents,
+    tools, and other metadata fields for conformance to specification limits.
+
+    Args:
+        dataframe: The DataFrame containing span data.
+
+    Returns:
+        List of validation errors found in span attributes.
+    """
     return list(
         chain(
             value_validation.check_string_column_value_length(
@@ -242,7 +267,17 @@ def check_span_attributes_values(
 
 def check_event_column_value(
     df: pd.DataFrame,
-) ->
+) -> list[InvalidEventValueInColumn]:
+    """Validates span event column values for proper format and length constraints.
+
+    Checks event names for length limits and attributes for proper dictionary structure.
+
+    Args:
+        df: The DataFrame containing span events.
+
+    Returns:
+        List of validation errors found in event column values.
+    """
     col_name = tracing_cols.SPAN_EVENTS_COL.name
     if col_name not in df.columns:
         return []
@@ -284,7 +319,18 @@ def check_event_column_value(
 
 def check_embeddings_column_value(
     df: pd.DataFrame,
-) ->
+) -> list[InvalidEmbeddingValueInColumn]:
+    """Validates embedding column values for proper vector dimensions and text length.
+
+    Checks that embedding vectors are within dimensionality limits and text
+    values don't exceed maximum length.
+
+    Args:
+        df: The DataFrame containing embedding data.
+
+    Returns:
+        List of validation errors found in embedding column values.
+    """
     col_name = tracing_cols.SPAN_ATTRIBUTES_EMBEDDING_EMBEDDINGS_COL.name
     if col_name not in df.columns:
         return []
@@ -332,7 +378,19 @@
 def check_LLM_IO_messages_column_value(
     df: pd.DataFrame,
     col_name: str,
-) ->
+) -> list[InvalidLLMMessageValueInColumn]:
+    """Validates LLM input/output message column values for proper format and length.
+
+    Checks message role, content, and tool calls for conformance to length limits
+    and proper JSON formatting.
+
+    Args:
+        df: The DataFrame containing LLM messages.
+        col_name: Name of the message column to validate.
+
+    Returns:
+        List of validation errors found in message column values.
+    """
     if col_name not in df.columns:
         return []
 
@@ -407,7 +465,19 @@
 def check_documents_column_value(
     df: pd.DataFrame,
     col_name: str,
-) ->
+) -> list[InvalidDocumentValueInColumn]:
+    """Validates document column values for proper format and length constraints.
+
+    Checks document ID, content, and metadata for conformance to length limits
+    and proper data type requirements.
+
+    Args:
+        df: The DataFrame containing documents.
+        col_name: Name of the document column to validate.
+
+    Returns:
+        List of validation errors found in document column values.
+    """
     if col_name not in df.columns:
         return []
 
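Each `check_*` helper above returns a list of error objects rather than raising, so callers can run several checks and aggregate the results. A minimal sketch of that pattern (the `arize.spans.validation.spans.value_validation` import path is inferred from the package layout in the file list, and `spans_df` is a placeholder for a real spans DataFrame):

```python
# Sketch only: the import path is inferred from the new package layout shown
# in the file list above; spans_df stands in for an actual spans DataFrame.
import pandas as pd

from arize.spans.validation.spans import value_validation

spans_df = pd.DataFrame()  # replace with a real spans DataFrame

# Collect errors from multiple checks instead of stopping at the first failure.
errors = [
    *value_validation.check_span_root_field_values(spans_df),
    *value_validation.check_span_attributes_values(spans_df),
]
for error in errors:
    print(error)
```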
arize/utils/__init__.py
CHANGED

@@ -0,0 +1 @@
+"""Utility functions and helper modules for the Arize SDK."""
arize/utils/arrow.py
CHANGED

@@ -1,3 +1,5 @@
+"""Apache Arrow utilities for data serialization and file operations."""
+
 # type: ignore[pb2]
 from __future__ import annotations
 
@@ -5,7 +7,7 @@ import base64
 import logging
 import os
 import tempfile
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
 
@@ -23,16 +25,30 @@ def post_arrow_table(
     files_url: str,
     pa_table: pa.Table,
     proto_schema: pb2.Schema,
-    headers:
+    headers: dict[str, str],
     timeout: float | None,
     verify: bool,
     max_chunksize: int,
     tmp_dir: str = "",
 ) -> requests.Response:
-
+    """Post a PyArrow table to Arize via HTTP file upload.
+
+    Args:
+        files_url: The URL endpoint for file uploads.
+        pa_table: The PyArrow table containing the data.
+        proto_schema: The protobuf schema for the data.
+        headers: HTTP headers for the request.
+        timeout: Request timeout in seconds, or None for no timeout.
+        verify: Whether to verify SSL certificates.
+        max_chunksize: Maximum chunk size for splitting large tables.
+        tmp_dir: Temporary directory for serialization. Defaults to "".
+
+    Returns:
+        The HTTP response from the upload request.
+    """
+    # We import here to avoid depending on requests for all arrow utils
    import requests
 
-    logger.debug("Preparing to log Arrow table via file upload")
     logger.debug(
         "Preparing to log Arrow table via file upload",
         extra={"rows": pa_table.num_rows, "cols": pa_table.num_columns},
@@ -94,20 +110,20 @@ def post_arrow_table(
             tdir.cleanup()  # cleaning the entire dir, no need to clean the file
         except Exception as e:
             logger.warning(
-                f"Failed to remove temporary directory {tdir.name}: {
+                f"Failed to remove temporary directory {tdir.name}: {e!s}"
             )
     elif cleanup_file:
         try:
             os.remove(outfile)
         except Exception as e:
             logger.warning(
-                f"Failed to remove temporary file {outfile}: {
+                f"Failed to remove temporary file {outfile}: {e!s}"
             )
 
 
 def _append_to_pyarrow_metadata(
-    pa_schema: pa.Schema, new_metadata:
-):
+    pa_schema: pa.Schema, new_metadata: dict[str, Any]
+) -> object:
     # Ensure metadata is handled correctly, even if initially None.
     metadata = pa_schema.metadata
     if metadata is None:
@@ -129,9 +145,10 @@ def _append_to_pyarrow_metadata(
 def _write_arrow_file(
     path: str, pa_table: pa.Table, pa_schema: pa.Schema, max_chunksize: int
 ) -> None:
-    with
-        sink,
-
+    with (
+        pa.OSFile(path, mode="wb") as sink,
+        pa.ipc.RecordBatchStreamWriter(sink, pa_schema) as writer,
+    ):
         writer.write_table(pa_table, max_chunksize)
 
 
@@ -145,10 +162,9 @@ def _maybe_log_project_url(response: requests.Response) -> None:
 
 
 def _mktemp_in(directory: str) -> str:
-    """
-
-
-    disk and is closed; caller can open/write it later.
+    """Create a unique temp file path inside `directory` without leaving an open file descriptor.
+
+    Windows-safe. The file exists on disk and is closed; caller can open/write it later.
     """
     with tempfile.NamedTemporaryFile(
         dir=directory,
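The rewritten `_write_arrow_file` manages both the output file and the IPC stream writer with a single parenthesized `with` statement (Python 3.10+ syntax). A standalone sketch of the same pyarrow pattern, using a made-up path and table:

```python
# Illustration of the Arrow IPC stream-writing pattern used by _write_arrow_file;
# the file path and table contents here are invented for the example.
import pyarrow as pa

table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})

with (
    pa.OSFile("/tmp/example.arrow", mode="wb") as sink,
    pa.ipc.RecordBatchStreamWriter(sink, table.schema) as writer,
):
    # Writing in record batches of at most max_chunksize rows keeps memory bounded.
    writer.write_table(table, max_chunksize=2)
```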
arize/utils/cache.py
CHANGED

@@ -1,10 +1,16 @@
+"""Caching utilities for resource management and persistence."""
+
 from __future__ import annotations
 
 import logging
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 import pandas as pd
 
+if TYPE_CHECKING:
+    from datetime import datetime
+
 logger = logging.getLogger(__name__)
 
 
@@ -12,9 +18,21 @@ def load_cached_resource(
     cache_dir: str,
     resource: str,
     resource_id: str,
-    resource_updated_at:
+    resource_updated_at: datetime | None,
     format: str = "parquet",
 ) -> pd.DataFrame | None:
+    """Load a cached resource from the local cache directory.
+
+    Args:
+        cache_dir: Directory path for cache storage.
+        resource: Resource type name (e.g., "dataset", "experiment").
+        resource_id: Unique identifier for the resource.
+        resource_updated_at: Optional timestamp of last resource update.
+        format: File format for cached data. Defaults to "parquet".
+
+    Returns:
+        The cached DataFrame if found and valid, None otherwise.
+    """
     key = _get_cache_key(resource, resource_id, resource_updated_at)
     filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
     if not filepath.exists():
@@ -30,10 +48,20 @@ def cache_resource(
     cache_dir: str,
     resource: str,
     resource_id: str,
-    resource_updated_at:
+    resource_updated_at: datetime | None,
     resource_data: pd.DataFrame,
     format: str = "parquet",
 ) -> None:
+    """Save a resource to the local cache directory.
+
+    Args:
+        cache_dir: Directory path for cache storage.
+        resource: Resource type name (e.g., "dataset", "experiment").
+        resource_id: Unique identifier for the resource.
+        resource_updated_at: Optional timestamp of last resource update.
+        resource_data: DataFrame containing the resource data.
+        format: File format for cached data. Defaults to "parquet".
+    """
     key = _get_cache_key(resource, resource_id, resource_updated_at)
     filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
     filepath.parent.mkdir(parents=True, exist_ok=True)
@@ -44,12 +72,12 @@ def cache_resource(
 def _get_cache_key(
     resource: str,
     resource_id: str,
-    resource_updated_at:
+    resource_updated_at: datetime | None,
 ) -> str:
     # include updated_at if present to produce a new key when dataset changes
     key = f"{resource}_{resource_id}"
     if resource_updated_at:
-        key += f"_{resource_updated_at}"
+        key += f"_{resource_updated_at.strftime('%Y%m%dT%H%M%S')}"
     return key
 
 
@@ -58,8 +86,8 @@ def _get_abs_file_path(
     filename: str,
     subdirectory: str | None = None,
 ) -> Path:
-    """
-
+    """Return an absolute path to a file located under `directory[/subdirectory]/filename`.
+
     Expands '~' and resolves relative components.
     """
     base = Path(directory).expanduser()
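`_get_cache_key` now formats `resource_updated_at` with `strftime('%Y%m%dT%H%M%S')` instead of interpolating the raw datetime, which keeps the cache filename free of spaces and colons. A small sketch of the resulting key (the resource name, id, and timestamp are made up):

```python
# Reproduces the key format from _get_cache_key above with invented values.
from datetime import datetime

resource = "dataset"
resource_id = "abc123"
resource_updated_at = datetime(2024, 5, 1, 12, 30, 0)

key = f"{resource}_{resource_id}"
if resource_updated_at:
    key += f"_{resource_updated_at.strftime('%Y%m%dT%H%M%S')}"

print(key)  # dataset_abc123_20240501T123000
```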
arize/utils/dataframe.py
CHANGED

@@ -1,13 +1,19 @@
+"""DataFrame manipulation and validation utilities."""
+
 import re
-from typing import List
 
 import pandas as pd
 
-from arize.types import BaseSchema
+from arize.ml.types import BaseSchema
 
 
 # Resets the dataframe index if it is not a RangeIndex
 def reset_dataframe_index(dataframe: pd.DataFrame) -> None:
+    """Reset the DataFrame index in-place if it is not a RangeIndex.
+
+    Args:
+        dataframe: The pandas DataFrame to reset.
+    """
     if not isinstance(dataframe.index, pd.RangeIndex):
         drop = dataframe.index.name in dataframe.columns
         dataframe.reset_index(inplace=True, drop=drop)
@@ -16,9 +22,20 @@ def reset_dataframe_index(dataframe: pd.DataFrame) -> None:
 def remove_extraneous_columns(
     df: pd.DataFrame,
     schema: BaseSchema | None = None,
-    column_list:
+    column_list: list[str] | None = None,
     regex: str | None = None,
 ) -> pd.DataFrame:
+    """Filter DataFrame to keep only relevant columns based on schema, list, or regex.
+
+    Args:
+        df: The pandas DataFrame to filter.
+        schema: Optional schema defining used columns. Defaults to None.
+        column_list: Optional explicit list of columns to keep. Defaults to None.
+        regex: Optional regex pattern to match column names. Defaults to None.
+
+    Returns:
+        A filtered DataFrame containing only the relevant columns.
+    """
     relevant_columns = set()
     if schema is not None:
         relevant_columns.update(schema.get_used_columns())
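`remove_extraneous_columns` accepts a schema, an explicit column list, and/or a regex, and keeps only the matching columns. A usage sketch based on the signature and docstring above (the DataFrame contents are invented, and the exact matching semantics beyond what the diff shows are assumed):

```python
# Hypothetical data; only the function name, parameters, and import path come
# from the diff above.
import pandas as pd

from arize.utils.dataframe import remove_extraneous_columns

df = pd.DataFrame(
    {
        "prediction_id": ["a", "b"],
        "attributes.llm.model_name": ["gpt-x", "gpt-x"],
        "scratch_column": [1, 2],
    }
)

filtered = remove_extraneous_columns(
    df,
    column_list=["prediction_id"],
    regex=r"^attributes\.",
)
# Expected to keep prediction_id plus the attributes.* column and drop the rest.
print(list(filtered.columns))
```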
arize/utils/online_tasks/dataframe_preprocessor.py
CHANGED

@@ -1,6 +1,7 @@
+"""DataFrame preprocessing utilities for online tasks."""
+
 import json
 import logging
-from typing import Any, List, Tuple
 
 import numpy as np
 import pandas as pd
@@ -8,17 +9,34 @@ import pandas as pd
 logger = logging.getLogger(__name__)
 
 
+class ColumnNotFoundError(Exception):
+    """Raised when a specified column is not found in the DataFrame."""
+
+    def __init__(self, attribute: str) -> None:
+        """Initialize with the attribute that couldn't be mapped to a column.
+
+        Args:
+            attribute: The attribute string that has no matching column prefix.
+        """
+        self.attribute = attribute
+        super().__init__(
+            f"No column found in DataFrame for attribute: {attribute}"
+        )
+
+
 def extract_nested_data_to_column(
-    attributes:
+    attributes: list[str], df: pd.DataFrame
 ) -> pd.DataFrame:
-    """
+    """Extract nested attributes from complex data structures into new DataFrame columns.
+
     This function, used in Online Tasks, is typically run on data exported from Arize.
-    It prepares the DataFrame by extracting relevant attributes from complex, deeply
-    data structures, such as those found in LLM outputs or JSON-like records.
-    specific values from these nested structures by identifying the
-    in the DataFrame and recursively accessing the desired
-    This preprocessing step ensures that the extracted
-    allowing evaluators to process and assess
+    It prepares the DataFrame by extracting relevant attributes from complex, deeply
+    nested data structures, such as those found in LLM outputs or JSON-like records.
+    It helps extract specific values from these nested structures by identifying the
+    longest matching column name in the DataFrame and recursively accessing the desired
+    attribute path within each row. This preprocessing step ensures that the extracted
+    values are available as new columns, allowing evaluators to process and assess
+    these values effectively.
 
     For each attributes string in `attributes` (e.g. "attributes.llm.output_messages.0.message.content"),
     1) Find the largest prefix that is actually a column name in `df`. (e.g. "attributes.llm.output_messages")
@@ -37,13 +55,12 @@ def extract_nested_data_to_column(
     5) Log how many rows were dropped and, if zero rows remain, log a message indicating that
        there are no rows satisfying *all* of the queries.
     """
-
     # Make a copy so as not to alter the input df
     result_df = df.copy()
 
     # Keep track of which new columns we add. Each column name will match each user-inputted attribute
     # (e.g. "attributes.llm.output_messages.0.message.content")
-    new_cols:
+    new_cols: list[str] = []
 
     for attribute in attributes:
         parts = attribute.split(".")
@@ -58,7 +75,7 @@ def extract_nested_data_to_column(
                 prefix_len = i
 
         if prefix_col is None:
-            raise
+            raise ColumnNotFoundError(attribute)
 
         # 2) The remainder after the prefix
         remainder = ".".join(parts[prefix_len:])
@@ -68,13 +85,14 @@ def extract_nested_data_to_column(
             row: pd.Series,
             prefix_col: str = prefix_col,
             remainder: str = remainder,
-        ) ->
+        ) -> object:
             val = row[prefix_col]
             try:
                 result = _introspect_arize_attribute(val, remainder)
-                return result if result is not None else np.nan
             except Exception:
                 return np.nan
+            else:
+                return result if result is not None else np.nan
 
         result_df[attribute] = result_df.apply(
             apply_introspect_arize_attribute, axis=1
@@ -101,10 +119,10 @@ def extract_nested_data_to_column(
     return result_df
 
 
-def _introspect_arize_attribute(value:
-    """
-
-
+def _introspect_arize_attribute(value: object, attribute: str) -> object:
+    """Recursively drill into `value` following the dot-delimited `attribute`.
+
+    Examples:
         value: [{'message.role': 'assistant', 'message.content': 'The capital of China is Beijing.'}]
         attribute: "0.message.content"
         Returns: 'The capital of China is Beijing.'
@@ -114,7 +132,6 @@ def _introspect_arize_attribute(value: Any, attribute: str) -> Any:
     - Parses JSON strings
     - Converts NumPy arrays to lists
     - Allows dotted keys (e.g. "message.content") by combining parts
-
     """
     if not attribute:
         return value
@@ -124,8 +141,8 @@ def _introspect_arize_attribute(value: Any, attribute: str) -> Any:
 
 
 def _introspect_arize_attribute_parts(
-    current_value:
-) ->
+    current_value: object, attribute_parts_unprocessed: list[str]
+) -> object:
     # If no more parts, we return whatever we have
     if not attribute_parts_unprocessed:
         return current_value
@@ -148,10 +165,9 @@ def _introspect_arize_attribute_parts(
 
 
 def _parse_value(
-    current_value:
-) ->
-    """
-    Attempt to parse out the next value from `current_value` using the earliest parts:
+    current_value: object, attribute_parts_unprocessed: list[str]
+) -> tuple[object, int]:
+    """Attempt to parse out the next value from `current_value` using the earliest parts.
 
     1) If `attribute_parts_unprocessed[0]` is an integer index and `current_value` is a list/tuple,
        index into it.
@@ -164,7 +180,6 @@ def _parse_value(
     - parsed_value: the found value or None if not found
     - num_parts_processed: how many parts were processed (1 or more)
     """
-
     if not attribute_parts_unprocessed:
         return (None, 0)
 
@@ -179,38 +194,34 @@ def _parse_value(
     idx = _try_int(key)
     if idx is not None:
         # Must be a tuple or list (_ensure_deserialized() already casts numpy arrays to python lists)
-        if isinstance(current_value,
-
-
-
-
-        else:
-            return (None, num_parts_processed)
+        if isinstance(current_value, list | tuple) and 0 <= idx < len(
+            current_value
+        ):
+            return (current_value[idx], num_parts_processed)
+        return (None, num_parts_processed)
 
     # 2) Try dict approach
     if isinstance(current_value, dict):
         # a) direct match
         if key in current_value:
             return (current_value[key], num_parts_processed)
-
-
-
-
-
-
-
-
-
-            num_parts_processed + 1,
-        )
-        return (None, num_parts_processed)
+        # b) try combining multiple parts to handle dotted key
+        for num_parts_processed in range(1, len(attribute_parts_unprocessed)):
+            key += "." + attribute_parts_unprocessed[num_parts_processed]
+            if key in current_value:
+                return (
+                    current_value[key],
+                    num_parts_processed + 1,
+                )
+        return (None, num_parts_processed)
 
     # If we get here, we couldn't handle it (not a list or dict or mismatch)
     return (None, num_parts_processed)
 
 
-def _ensure_deserialized(val:
-    """
+def _ensure_deserialized(val: object) -> object:
+    """Ensure value is deserialized from numpy array or JSON string.
+
     1) If `val` is a numpy array, convert to a Python list.
     2) If `val` is a string, attempt to parse as JSON.
     3) Otherwise return as-is.
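The docstring above describes the extraction algorithm: find the longest column-name prefix of each attribute string, then drill into the remainder for every row. A sketch built from the docstring's own example (the import path is inferred from the file layout, and the DataFrame contents mirror that example):

```python
# The nested message structure below mirrors the example given in the docstring.
import pandas as pd

from arize.utils.online_tasks.dataframe_preprocessor import (
    extract_nested_data_to_column,
)

df = pd.DataFrame(
    {
        "attributes.llm.output_messages": [
            [{"message.role": "assistant", "message.content": "The capital of China is Beijing."}],
        ]
    }
)

out = extract_nested_data_to_column(
    ["attributes.llm.output_messages.0.message.content"], df
)
# A new column named after the full attribute path should now hold the content string.
print(out["attributes.llm.output_messages.0.message.content"].iloc[0])
```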
arize/utils/openinference_conversion.py
CHANGED

@@ -1,11 +1,24 @@
+"""OpenInference data conversion utilities for column transformations."""
+
 import json
+import logging
 
 import pandas as pd
 
 from arize.constants.openinference import OPEN_INFERENCE_JSON_STR_TYPES
 
+logger = logging.getLogger(__name__)
+
 
 def convert_datetime_columns_to_int(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert datetime columns in a DataFrame to milliseconds since epoch.
+
+    Args:
+        df: The pandas DataFrame to convert.
+
+    Returns:
+        The DataFrame with datetime columns converted to integers.
+    """
     for col in df.select_dtypes(
         include=["datetime64[ns]", "datetime64[ns, UTC]"]
     ):
@@ -14,6 +27,14 @@ def convert_datetime_columns_to_int(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def convert_boolean_columns_to_str(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert boolean columns in a DataFrame to string type.
+
+    Args:
+        df: The pandas DataFrame to convert.
+
+    Returns:
+        The DataFrame with boolean columns converted to strings.
+    """
     for col in df.columns:
         if df[col].dtype == "bool":
             df[col] = df[col].astype("string")
@@ -21,33 +42,51 @@ def convert_boolean_columns_to_str(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def convert_default_columns_to_json_str(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert dictionary values in specific columns to JSON strings.
+
+    Args:
+        df: The pandas DataFrame to convert.
+
+    Returns:
+        The DataFrame with dictionaries in eligible columns converted to JSON strings.
+    """
     for col in df.columns:
         if _should_convert_json(col):
             try:
                 df[col] = df[col].apply(
                     lambda x: json.dumps(x) if isinstance(x, dict) else x
                 )
-            except Exception:
+            except Exception as e:
+                logger.debug(
+                    f"Failed to convert column '{col}' to JSON string: {e}"
+                )
                 continue
     return df
 
 
 def convert_json_str_to_dict(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert JSON string values in specific columns to Python dictionaries.
+
+    Args:
+        df: The pandas DataFrame to convert.
+
+    Returns:
+        The DataFrame with JSON strings in eligible columns converted to dictionaries.
+    """
     for col in df.columns:
         if _should_convert_json(col):
             try:
                 df[col] = df[col].apply(
                     lambda x: json.loads(x) if isinstance(x, str) else x
                 )
-            except Exception:
+            except Exception as e:
+                logger.debug(f"Failed to parse column '{col}' as JSON: {e}")
                 continue
     return df
 
 
 def _should_convert_json(col_name: str) -> bool:
-    """
-    Check if a column should be converted to/from a JSON string/PythonDictionary.
-    """
+    """Check if a column should be converted to/from a JSON string/PythonDictionary."""
     is_eval_metadata = col_name.startswith("eval.") and col_name.endswith(
         ".metadata"
     )
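The conversion helpers now log a debug message when a column cannot be converted instead of swallowing the exception silently. A minimal usage sketch of the two dtype converters shown above (column names and values are invented):

```python
# Only the function names and import path come from the diff; the data is made up.
import pandas as pd

from arize.utils.openinference_conversion import (
    convert_boolean_columns_to_str,
    convert_datetime_columns_to_int,
)

df = pd.DataFrame(
    {
        "start_time": pd.to_datetime(["2024-05-01T00:00:00Z"]),  # datetime64[ns, UTC]
        "is_root": [True],
    }
)

df = convert_datetime_columns_to_int(df)
df = convert_boolean_columns_to_str(df)
# start_time should now be an integer (ms since epoch) column; is_root a string column.
print(df.dtypes)
```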