arize 8.0.0a22__py3-none-any.whl → 8.0.0a23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +17 -9
- arize/_exporter/client.py +55 -36
- arize/_exporter/parsers/tracing_data_parser.py +41 -30
- arize/_exporter/validation.py +3 -3
- arize/_flight/client.py +207 -76
- arize/_generated/api_client/__init__.py +30 -6
- arize/_generated/api_client/api/__init__.py +1 -0
- arize/_generated/api_client/api/datasets_api.py +864 -190
- arize/_generated/api_client/api/experiments_api.py +167 -131
- arize/_generated/api_client/api/projects_api.py +1197 -0
- arize/_generated/api_client/api_client.py +2 -2
- arize/_generated/api_client/configuration.py +42 -34
- arize/_generated/api_client/exceptions.py +2 -2
- arize/_generated/api_client/models/__init__.py +15 -4
- arize/_generated/api_client/models/dataset.py +10 -10
- arize/_generated/api_client/models/dataset_example.py +111 -0
- arize/_generated/api_client/models/dataset_example_update.py +100 -0
- arize/_generated/api_client/models/dataset_version.py +13 -13
- arize/_generated/api_client/models/datasets_create_request.py +16 -8
- arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
- arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
- arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
- arize/_generated/api_client/models/datasets_list200_response.py +10 -4
- arize/_generated/api_client/models/experiment.py +14 -16
- arize/_generated/api_client/models/experiment_run.py +108 -0
- arize/_generated/api_client/models/experiment_run_create.py +102 -0
- arize/_generated/api_client/models/experiments_create_request.py +16 -10
- arize/_generated/api_client/models/experiments_list200_response.py +10 -4
- arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
- arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
- arize/_generated/api_client/models/primitive_value.py +172 -0
- arize/_generated/api_client/models/problem.py +100 -0
- arize/_generated/api_client/models/project.py +99 -0
- arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
- arize/_generated/api_client/models/projects_list200_response.py +106 -0
- arize/_generated/api_client/rest.py +2 -2
- arize/_generated/api_client/test/test_dataset.py +4 -2
- arize/_generated/api_client/test/test_dataset_example.py +56 -0
- arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
- arize/_generated/api_client/test/test_dataset_version.py +7 -2
- arize/_generated/api_client/test/test_datasets_api.py +27 -13
- arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
- arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
- arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
- arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
- arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
- arize/_generated/api_client/test/test_experiment.py +2 -4
- arize/_generated/api_client/test/test_experiment_run.py +56 -0
- arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
- arize/_generated/api_client/test/test_experiments_api.py +6 -6
- arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
- arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
- arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
- arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
- arize/_generated/api_client/test/test_problem.py +57 -0
- arize/_generated/api_client/test/test_project.py +58 -0
- arize/_generated/api_client/test/test_projects_api.py +59 -0
- arize/_generated/api_client/test/test_projects_create_request.py +54 -0
- arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
- arize/_generated/api_client_README.md +43 -29
- arize/_generated/protocol/flight/flight_pb2.py +400 -0
- arize/_lazy.py +27 -19
- arize/client.py +268 -55
- arize/config.py +365 -116
- arize/constants/__init__.py +1 -0
- arize/constants/config.py +11 -4
- arize/constants/ml.py +6 -4
- arize/constants/openinference.py +2 -0
- arize/constants/pyarrow.py +2 -0
- arize/constants/spans.py +3 -1
- arize/datasets/__init__.py +1 -0
- arize/datasets/client.py +299 -84
- arize/datasets/errors.py +32 -2
- arize/datasets/validation.py +18 -8
- arize/embeddings/__init__.py +2 -0
- arize/embeddings/auto_generator.py +23 -19
- arize/embeddings/base_generators.py +89 -36
- arize/embeddings/constants.py +2 -0
- arize/embeddings/cv_generators.py +26 -4
- arize/embeddings/errors.py +27 -5
- arize/embeddings/nlp_generators.py +31 -12
- arize/embeddings/tabular_generators.py +32 -20
- arize/embeddings/usecases.py +12 -2
- arize/exceptions/__init__.py +1 -0
- arize/exceptions/auth.py +11 -1
- arize/exceptions/base.py +29 -4
- arize/exceptions/models.py +21 -2
- arize/exceptions/parameters.py +31 -0
- arize/exceptions/spaces.py +12 -1
- arize/exceptions/types.py +86 -7
- arize/exceptions/values.py +220 -20
- arize/experiments/__init__.py +1 -0
- arize/experiments/client.py +389 -285
- arize/experiments/evaluators/__init__.py +1 -0
- arize/experiments/evaluators/base.py +74 -41
- arize/experiments/evaluators/exceptions.py +6 -3
- arize/experiments/evaluators/executors.py +121 -73
- arize/experiments/evaluators/rate_limiters.py +106 -57
- arize/experiments/evaluators/types.py +34 -7
- arize/experiments/evaluators/utils.py +65 -27
- arize/experiments/functions.py +103 -101
- arize/experiments/tracing.py +52 -44
- arize/experiments/types.py +56 -31
- arize/logging.py +54 -22
- arize/models/__init__.py +1 -0
- arize/models/batch_validation/__init__.py +1 -0
- arize/models/batch_validation/errors.py +543 -65
- arize/models/batch_validation/validator.py +339 -300
- arize/models/bounded_executor.py +20 -7
- arize/models/casting.py +75 -29
- arize/models/client.py +326 -107
- arize/models/proto.py +95 -40
- arize/models/stream_validation.py +42 -14
- arize/models/surrogate_explainer/__init__.py +1 -0
- arize/models/surrogate_explainer/mimic.py +24 -13
- arize/pre_releases.py +43 -0
- arize/projects/__init__.py +1 -0
- arize/projects/client.py +129 -0
- arize/regions.py +40 -0
- arize/spans/__init__.py +1 -0
- arize/spans/client.py +130 -106
- arize/spans/columns.py +13 -0
- arize/spans/conversion.py +54 -38
- arize/spans/validation/__init__.py +1 -0
- arize/spans/validation/annotations/__init__.py +1 -0
- arize/spans/validation/annotations/annotations_validation.py +6 -4
- arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
- arize/spans/validation/annotations/value_validation.py +35 -11
- arize/spans/validation/common/__init__.py +1 -0
- arize/spans/validation/common/argument_validation.py +33 -8
- arize/spans/validation/common/dataframe_form_validation.py +35 -9
- arize/spans/validation/common/errors.py +211 -11
- arize/spans/validation/common/value_validation.py +80 -13
- arize/spans/validation/evals/__init__.py +1 -0
- arize/spans/validation/evals/dataframe_form_validation.py +28 -8
- arize/spans/validation/evals/evals_validation.py +34 -4
- arize/spans/validation/evals/value_validation.py +26 -3
- arize/spans/validation/metadata/__init__.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +14 -5
- arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
- arize/spans/validation/metadata/value_validation.py +24 -10
- arize/spans/validation/spans/__init__.py +1 -0
- arize/spans/validation/spans/dataframe_form_validation.py +34 -13
- arize/spans/validation/spans/spans_validation.py +35 -4
- arize/spans/validation/spans/value_validation.py +76 -7
- arize/types.py +293 -157
- arize/utils/__init__.py +1 -0
- arize/utils/arrow.py +31 -15
- arize/utils/cache.py +34 -6
- arize/utils/dataframe.py +19 -2
- arize/utils/online_tasks/__init__.py +2 -0
- arize/utils/online_tasks/dataframe_preprocessor.py +53 -41
- arize/utils/openinference_conversion.py +44 -5
- arize/utils/proto.py +10 -0
- arize/utils/size.py +5 -3
- arize/version.py +3 -1
- {arize-8.0.0a22.dist-info → arize-8.0.0a23.dist-info}/METADATA +4 -3
- arize-8.0.0a23.dist-info/RECORD +174 -0
- {arize-8.0.0a22.dist-info → arize-8.0.0a23.dist-info}/WHEEL +1 -1
- arize-8.0.0a23.dist-info/licenses/LICENSE +176 -0
- arize-8.0.0a23.dist-info/licenses/NOTICE +13 -0
- arize/_generated/protocol/flight/export_pb2.py +0 -61
- arize/_generated/protocol/flight/ingest_pb2.py +0 -365
- arize-8.0.0a22.dist-info/RECORD +0 -146
- arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
arize/constants/ml.py
CHANGED
@@ -1,3 +1,5 @@
+"""Machine learning constants and validation limits."""
+
 import json
 from pathlib import Path

@@ -30,7 +32,7 @@ MAX_PROMPT_TEMPLATE_VERSION_LENGTH_TRUNCATION = 50
 MAX_NUMBER_OF_EMBEDDINGS = 30
 MAX_EMBEDDING_DIMENSIONALITY = 20_000
 # # The maximum number of classes for multi class
-MAX_NUMBER_OF_MULTI_CLASS_CLASSES =
+MAX_NUMBER_OF_MULTI_CLASS_CLASSES = 500
 MAX_MULTI_CLASS_NAME_LENGTH = 100
 # The maximum number of references in embedding similarity search params
 MAX_NUMBER_OF_SIMILARITY_REFERENCES = 10
@@ -40,9 +42,9 @@ MAX_NUMBER_OF_SIMILARITY_REFERENCES = 10
 # GENERATED_LLM_PARAMS_JSON_COL = "arize_generated_llm_params_json"
 #
 # # reserved columns for LLM run metadata
-LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME = "total_token_count"
-LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME = "prompt_token_count"
-LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME = "response_token_count"
+LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME = "total_token_count"  # noqa: S105
+LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME = "prompt_token_count"  # noqa: S105
+LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME = "response_token_count"  # noqa: S105
 LLM_RUN_METADATA_RESPONSE_LATENCY_MS_TAG_NAME = "response_latency_ms"
 #
 # all reserved tags
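
The `# noqa: S105` markers silence the Bandit/Ruff hardcoded-password check, which these token-count tag names otherwise trip, and the multi-class limit is now pinned at 500. As a rough illustration of the kind of client-side check such limits feed into, here is a standalone sketch; the helper function is hypothetical and only the constant values (500 classes, 100-character names) come from the diff above.

```python
# Hypothetical validation helper; only the constant values come from the diff above.
MAX_NUMBER_OF_MULTI_CLASS_CLASSES = 500
MAX_MULTI_CLASS_NAME_LENGTH = 100


def check_multi_class_scores(scores: dict[str, float]) -> None:
    """Reject score maps that exceed the documented multi-class limits."""
    if len(scores) > MAX_NUMBER_OF_MULTI_CLASS_CLASSES:
        raise ValueError(
            f"{len(scores)} classes exceeds the maximum of "
            f"{MAX_NUMBER_OF_MULTI_CLASS_CLASSES}"
        )
    for name in scores:
        if len(name) > MAX_MULTI_CLASS_NAME_LENGTH:
            raise ValueError(
                f"Class name longer than {MAX_MULTI_CLASS_NAME_LENGTH} chars: {name!r}"
            )


check_multi_class_scores({"cat": 0.7, "dog": 0.3})  # passes silently
```
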
arize/constants/openinference.py
CHANGED
arize/constants/pyarrow.py
CHANGED
arize/constants/spans.py
CHANGED
@@ -1,4 +1,6 @@
-
+"""Span-related constants and validation limits for tracing."""
+
+# The default format used to parse datetime objects from strings
 DEFAULT_DATETIME_FMT = "%Y-%m-%dT%H:%M:%S.%f+00:00"
 # Minumum/Maximum number of characters for span/trace/parent ids in spans
 SPAN_ID_MIN_STR_LENGTH = 12
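
A quick, standalone illustration of the two constants visible in this hunk (the default datetime format and the minimum span-ID length); this is a sketch, not code from the package.

```python
# Standalone sketch; the two constant values are taken from the hunk above.
from datetime import datetime

DEFAULT_DATETIME_FMT = "%Y-%m-%dT%H:%M:%S.%f+00:00"
SPAN_ID_MIN_STR_LENGTH = 12

# Parse a UTC timestamp string in the documented default format.
start_time = datetime.strptime("2024-06-01T12:34:56.789000+00:00", DEFAULT_DATETIME_FMT)
print(start_time)  # 2024-06-01 12:34:56.789000

# Span/trace/parent IDs are expected to meet a minimum string length.
span_id = "a1b2c3d4e5f6"
assert len(span_id) >= SPAN_ID_MIN_STR_LENGTH
```
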
arize/datasets/__init__.py
CHANGED
@@ -0,0 +1 @@
+"""Dataset management and validation utilities for the Arize SDK."""
arize/datasets/client.py
CHANGED
@@ -1,18 +1,20 @@
+"""Client implementation for managing datasets in the Arize platform."""
+
 from __future__ import annotations

 import logging
 import time
 import uuid
-from typing import
+from typing import TYPE_CHECKING

 import pandas as pd
 import pyarrow as pa

 from arize._flight.client import ArizeFlightClient
 from arize._generated.api_client import models
-from arize.config import SDKConfiguration
 from arize.datasets.validation import validate_dataset_df
 from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
+from arize.pre_releases import ReleaseStage, prerelease_endpoint
 from arize.utils.cache import cache_resource, load_cached_resource
 from arize.utils.openinference_conversion import (
     convert_boolean_columns_to_str,
@@ -21,40 +23,116 @@ from arize.utils.openinference_conversion import (
 )
 from arize.utils.size import get_payload_size_mb

+if TYPE_CHECKING:
+    from arize.config import SDKConfiguration
+
 logger = logging.getLogger(__name__)


 class DatasetsClient:
-
+    """Client for managing datasets including creation, retrieval, and example management."""
+
+    def __init__(self, *, sdk_config: SDKConfiguration) -> None:
+        """Create a datasets sub-client.
+
+        The datasets client is a thin wrapper around the generated REST API client,
+        using the shared generated API client owned by `SDKConfiguration`.
+
+        Args:
+            sdk_config: Resolved SDK configuration.
+        """
         self._sdk_config = sdk_config

-        # Import at runtime so it
+        # Import at runtime so it's still lazy and extras-gated by the parent
        from arize._generated import api_client as gen

         # Use the shared generated client from the config
         self._api = gen.DatasetsApi(self._sdk_config.get_generated_client())

-
-
-        self
-
+    @prerelease_endpoint(key="datasets.list", stage=ReleaseStage.BETA)
+    def list(
+        self,
+        *,
+        space_id: str | None = None,
+        limit: int = 100,
+        cursor: str | None = None,
+    ) -> models.DatasetsList200Response:
+        """List datasets the user has access to.
+
+        Datasets are returned in descending creation order (most recently created
+        first). Dataset versions are not included in this response; use `get()` to
+        retrieve a dataset along with its versions.
+
+        Args:
+            space_id: Optional space ID to scope results to a single space.
+            limit: Maximum number of datasets to return. The server enforces an
+                upper bound.
+            cursor: Opaque pagination cursor returned from a previous response.

-
-
-        self.list_examples = self._list_examples
+        Returns:
+            A response object with the datasets and pagination information.

-
+        Raises:
+            arize._generated.api_client.exceptions.ApiException: If the REST API
+                returns an error response (e.g. 401/403/429).
+        """
+        return self._api.datasets_list(
+            space_id=space_id,
+            limit=limit,
+            cursor=cursor,
+        )
+
+    @prerelease_endpoint(key="datasets.create", stage=ReleaseStage.BETA)
+    def create(
         self,
         *,
         name: str,
         space_id: str,
-        examples:
+        examples: list[dict[str, object]] | pd.DataFrame,
         force_http: bool = False,
-    ):
-
+    ) -> models.Dataset:
+        """Create a dataset with JSON examples.
+
+        Empty datasets are not allowed.
+
+        Payload notes (server-enforced):
+        - `name` must be unique within the given `space_id`.
+        - Each example may contain arbitrary user-defined fields.
+        - Do not include system-managed fields on create: `id`, `created_at`,
+          `updated_at` (requests containing these fields will be rejected).
+        - Each example must contain at least one property (i.e. `{}` is invalid).
+
+        Transport selection:
+        - If the payload is below the configured REST payload threshold (or
+          `force_http=True`), this method uploads via REST.
+        - Otherwise, it attempts a more efficient upload path via gRPC + Flight.
+
+        Args:
+            name: Dataset name (must be unique within the target space).
+            space_id: Space ID to create the dataset in.
+            examples: Dataset examples either as:
+                - a list of JSON-like dicts, or
+                - a pandas DataFrame (will be converted to records for REST).
+            force_http: If True, force REST upload even if the payload exceeds the
+                configured REST payload threshold.
+
+        Returns:
+            The created dataset object as returned by the API.
+
+        Raises:
+            TypeError: If `examples` is not a list of dicts or a pandas DataFrame.
+            RuntimeError: If the Flight upload path is selected and the Flight request
+                fails.
+            arize._generated.api_client.exceptions.ApiException: If the REST API
+                returns an error response (e.g. 400/401/403/409/429).
+        """
+        if not isinstance(examples, list | pd.DataFrame):
             raise TypeError(
                 "Examples must be a list of dicts or a pandas DataFrame"
             )
+        if len(examples) == 0:
+            raise ValueError("Cannot create an empty dataset")
+
         below_threshold = (
             get_payload_size_mb(examples)
             <= self._sdk_config.max_http_payload_size_mb
@@ -63,14 +141,14 @@ class DatasetsClient:
         from arize._generated import api_client as gen

         data = (
-            examples.to_dict(orient="records")
+            examples.to_dict(orient="records")  # type: ignore
             if isinstance(examples, pd.DataFrame)
             else examples
         )

         body = gen.DatasetsCreateRequest(
             name=name,
-
+            space_id=space_id,
             examples=data,
         )
         return self._api.datasets_create(datasets_create_request=body)
@@ -93,76 +171,83 @@ class DatasetsClient:
             examples=data,
         )

-
-
-
-        space_id: str,
-        examples: pd.DataFrame,
-    ):
-        data = examples.copy()
-        # Convert datetime columns to int64 (ms since epoch)
-        data = convert_datetime_columns_to_int(data)
-        data = convert_boolean_columns_to_str(data)
-        data = _set_default_columns_for_dataset(data)
-        data = convert_default_columns_to_json_str(data)
+    @prerelease_endpoint(key="datasets.get", stage=ReleaseStage.BETA)
+    def get(self, *, dataset_id: str) -> models.Dataset:
+        """Get a dataset by ID.

-
-
-
+        The returned dataset includes its dataset versions (sorted by creation time,
+        most recent first). Dataset examples are not included; use `list_examples()`
+        to retrieve examples.

-
-
-            logger.debug("Converting data to Arrow format")
-            pa_table = pa.Table.from_pandas(data, preserve_index=False)
-        except pa.ArrowInvalid as e:
-            logger.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
-            raise pa.ArrowInvalid(
-                f"Error converting to Arrow format: {str(e)}"
-            ) from e
-        except Exception as e:
-            logger.error(f"Unexpected error creating Arrow table: {str(e)}")
-            raise
+        Args:
+            dataset_id: Dataset ID to retrieve.

-
-
-            api_key=self._sdk_config.api_key,
-            host=self._sdk_config.flight_server_host,
-            port=self._sdk_config.flight_server_port,
-            scheme=self._sdk_config.flight_scheme,
-            request_verify=self._sdk_config.request_verify,
-            max_chunksize=self._sdk_config.pyarrow_max_chunksize,
-        ) as flight_client:
-            try:
-                response = flight_client.create_dataset(
-                    space_id=space_id,
-                    dataset_name=name,
-                    pa_table=pa_table,
-                )
-            except Exception as e:
-                msg = f"Error during update request: {str(e)}"
-                logger.error(msg)
-                raise RuntimeError(msg) from e
-            if response is None:
-                # This should not happen with proper Flight client implementation,
-                # but we handle it defensively
-                msg = "No response received from flight server during update"
-                logger.error(msg)
-                raise RuntimeError(msg)
-            # The response from flightserver is the dataset ID. To return the dataset
-            # object we make a GET query
-            dataset = self.get(dataset_id=response)
-            return dataset
+        Returns:
+            The dataset object.

-
+        Raises:
+            arize._generated.api_client.exceptions.ApiException: If the REST API
+                returns an error response (e.g. 401/403/404/429).
+        """
+        return self._api.datasets_get(dataset_id=dataset_id)
+
+    @prerelease_endpoint(key="datasets.delete", stage=ReleaseStage.BETA)
+    def delete(self, *, dataset_id: str) -> None:
+        """Delete a dataset by ID.
+
+        This operation is irreversible.
+
+        Args:
+            dataset_id: Dataset ID to delete.
+
+        Returns: This method returns None on success (common empty 204 response)
+
+        Raises:
+            arize._generated.api_client.exceptions.ApiException: If the REST API
+                returns an error response (e.g. 401/403/404/429).
+        """
+        return self._api.datasets_delete(dataset_id=dataset_id)
+
+    @prerelease_endpoint(key="datasets.list_examples", stage=ReleaseStage.BETA)
+    def list_examples(
         self,
         *,
         dataset_id: str,
         dataset_version_id: str = "",
         limit: int = 100,
         all: bool = False,
-    ):
+    ) -> models.DatasetsExamplesList200Response:
+        """List examples for a dataset (optionally for a specific version).
+
+        If `dataset_version_id` is not provided (empty string), the server selects
+        the latest dataset version.
+
+        Pagination notes:
+        - The response includes `pagination` for forward compatibility.
+        - Cursor pagination may not be fully implemented by the server yet.
+        - If `all=True`, this method retrieves all examples via the Flight path,
+          and returns them in a single response with `has_more=False`.
+
+        Args:
+            dataset_id: Dataset ID to list examples for.
+            dataset_version_id: Dataset version ID. If empty, the latest version is
+                selected.
+            limit: Maximum number of examples to return when `all=False`. The server
+                enforces an upper bound.
+            all: If True, fetch all examples (ignores `limit`) via Flight and return a
+                single response.
+
+        Returns:
+            A response object containing `examples` and `pagination` metadata.
+
+        Raises:
+            RuntimeError: If the Flight request fails or returns no response when
+                `all=True`.
+            arize._generated.api_client.exceptions.ApiException: If the REST API
+                returns an error response when `all=False` (e.g. 401/403/404/429).
+        """
         if not all:
-            return self._api.
+            return self._api.datasets_examples_list(
                 dataset_id=dataset_id,
                 dataset_version_id=dataset_version_id,
                 limit=limit,
@@ -184,14 +269,17 @@ class DatasetsClient:
                 resource_updated_at=dataset_updated_at,
             )
             if dataset_df is not None:
-                return models.
-                    examples=dataset_df.to_dict(orient="records")
+                return models.DatasetsExamplesList200Response(
+                    examples=dataset_df.to_dict(orient="records"),  # type: ignore
+                    pagination=models.PaginationMetadata(
+                        has_more=False,  # Note that all=True
+                    ),
                 )

         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
-            host=self._sdk_config.
-            port=self._sdk_config.
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
             max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -203,8 +291,8 @@ class DatasetsClient:
                 dataset_version_id=dataset_version_id,
             )
         except Exception as e:
-            msg = f"Error during request: {
-            logger.
+            msg = f"Error during request: {e!s}"
+            logger.exception(msg)
             raise RuntimeError(msg) from e
         if dataset_df is None:
             # This should not happen with proper Flight client implementation,
@@ -222,12 +310,139 @@ class DatasetsClient:
             resource_data=dataset_df,
         )

-        return models.
-            examples=dataset_df.to_dict(orient="records")
+        return models.DatasetsExamplesList200Response(
+            examples=dataset_df.to_dict(orient="records"),  # type: ignore
+            pagination=models.PaginationMetadata(
+                has_more=False,  # Note that all=True
+            ),
         )

+    # TODO(Kiko): Needs flightserver support
+    @prerelease_endpoint(
+        key="datasets.append_examples", stage=ReleaseStage.BETA
+    )
+    def append_examples(
+        self,
+        *,
+        dataset_id: str,
+        dataset_version_id: str = "",
+        examples: list[dict[str, object]] | pd.DataFrame,
+    ) -> models.Dataset:
+        """Append new examples to an existing dataset.
+
+        This method adds examples to an existing dataset version. If
+        `dataset_version_id` is not provided (empty string), the server appends
+        the examples to the latest dataset version.
+
+        The inserted examples are assigned system-generated IDs by the server.
+
+        Payload requirements (server-enforced):
+        - Each example may contain arbitrary user-defined fields.
+        - Do not include system-managed fields on input: `id`, `created_at`,
+          `updated_at` (requests containing these fields will be rejected).
+        - Each example must contain at least one property (i.e. empty
+          examples are not invalid).
+
+        Args:
+            dataset_id: Dataset ID to append examples to.
+            dataset_version_id: Optional dataset version ID to append examples to. If empty,
+                the latest dataset version is selected.
+            examples: Examples to append, provided as either:
+                - a list of JSON-like dicts, or
+                - a pandas DataFrame (converted to records before upload).
+
+        Returns:
+            The updated dataset object. To see the examples, use `list_examples()`.
+
+        Raises:
+            AssertionError: If `examples` is not a list of dicts or a pandas
+                DataFrame.
+            arize._generated.api_client.exceptions.ApiException: If the REST API
+                returns an error response (e.g. 400/401/403/404/429).
+        """
+        from arize._generated import api_client as gen
+
+        if not isinstance(examples, list | pd.DataFrame):
+            raise TypeError(
+                "Examples must be a list of dicts or a pandas DataFrame"
+            )
+
+        data = (
+            examples.to_dict(orient="records")  # type: ignore
+            if isinstance(examples, pd.DataFrame)
+            else examples
+        )
+        body = gen.DatasetsExamplesInsertRequest(examples=data)
+
+        return self._api.datasets_examples_insert(
+            dataset_id=dataset_id,
+            dataset_version_id=dataset_version_id,
+            datasets_examples_insert_request=body,
+        )
+
+    def _create_dataset_via_flight(
+        self,
+        name: str,
+        space_id: str,
+        examples: pd.DataFrame,
+    ) -> object:
+        """Internal method to create a dataset using Flight protocol for large example sets."""
+        data = examples.copy()
+        # Convert datetime columns to int64 (ms since epoch)
+        data = convert_datetime_columns_to_int(data)
+        data = convert_boolean_columns_to_str(data)
+        data = _set_default_columns_for_dataset(data)
+        data = convert_default_columns_to_json_str(data)
+
+        validation_errors = validate_dataset_df(data)
+        if validation_errors:
+            raise RuntimeError([e.error_message() for e in validation_errors])
+
+        # Convert to Arrow table
+        try:
+            logger.debug("Converting data to Arrow format")
+            pa_table = pa.Table.from_pandas(data, preserve_index=False)
+        except pa.ArrowInvalid as e:
+            logger.exception(INVALID_ARROW_CONVERSION_MSG)
+            raise pa.ArrowInvalid(
+                f"Error converting to Arrow format: {e!s}"
+            ) from e
+        except Exception:
+            logger.exception("Unexpected error creating Arrow table")
+            raise
+
+        response = None
+        with ArizeFlightClient(
+            api_key=self._sdk_config.api_key,
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
+            scheme=self._sdk_config.flight_scheme,
+            request_verify=self._sdk_config.request_verify,
+            max_chunksize=self._sdk_config.pyarrow_max_chunksize,
+        ) as flight_client:
+            try:
+                response = flight_client.create_dataset(
+                    space_id=space_id,
+                    dataset_name=name,
+                    pa_table=pa_table,
+                )
+            except Exception as e:
+                msg = f"Error during create request: {e!s}"
+                logger.exception(msg)
+                raise RuntimeError(msg) from e
+            if response is None:
+                # This should not happen with proper Flight client implementation,
+                # but we handle it defensively
+                msg = "No response received from flight server during update"
+                logger.error(msg)
+                raise RuntimeError(msg)
+            # The response from flightserver is the dataset ID. To return the dataset
+            # object we make a GET query
+            return self.get(dataset_id=response)
+

 def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
+    """Set default values for created_at and updated_at columns if missing or null."""
     current_time = int(time.time() * 1000)
     if "created_at" in df.columns:
         if df["created_at"].isnull().values.any():  # type: ignore
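
Taken together, this rewrite turns the dataset surface into a documented, beta-gated public API: `list`, `create`, `get`, `delete`, `list_examples`, and `append_examples`, with small payloads going over REST and large ones over gRPC + Flight. The usage sketch below is inferred from the method signatures in the diff only; how the sub-client is reached (assumed here to be a `datasets` attribute on `arize.client.Client`) and the `id` attribute on the returned models are assumptions, not confirmed by this diff.

```python
# Hypothetical usage sketch of the DatasetsClient methods added in 8.0.0a23.
# Assumed (not shown in this diff): `Client` as the entry point, `client.datasets`
# as the sub-client, and an `id` field on the returned Dataset model.
import pandas as pd

from arize.client import Client

client = Client()  # credentials resolved from env/config (assumption)

# Small example lists stay on the REST path.
dataset = client.datasets.create(
    name="qa-golden-set",
    space_id="YOUR_SPACE_ID",
    examples=[
        {"question": "What is Arize?", "expected": "An AI observability platform."},
        {"question": "What trace format is used?", "expected": "OpenInference spans."},
    ],
)

# DataFrames work too; payloads above the configured REST threshold are
# routed through gRPC + Flight unless force_http=True.
more = pd.DataFrame({"question": ["q3", "q4"], "expected": ["a3", "a4"]})
client.datasets.append_examples(dataset_id=dataset.id, examples=more)

# get() returns the dataset with its versions; examples come from list_examples().
fetched = client.datasets.get(dataset_id=dataset.id)
page = client.datasets.list_examples(dataset_id=fetched.id, limit=50)
print(len(page.examples), page.pagination.has_more)

# all=True pulls every example via Flight and returns has_more=False.
everything = client.datasets.list_examples(dataset_id=fetched.id, all=True)
```
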
arize/datasets/errors.py
CHANGED
@@ -1,21 +1,29 @@
+"""Dataset-specific exception classes."""
+
 from abc import ABC, abstractmethod


 class DatasetError(Exception, ABC):
+    """Base exception for dataset-related errors."""
+
     def __str__(self) -> str:
+        """Return a human-readable error message."""
         return self.error_message()

     @abstractmethod
     def __repr__(self) -> str:
-
+        """Return a string representation for debugging and logging."""

     @abstractmethod
     def error_message(self) -> str:
-
+        """Return the error message for this exception."""


 class InvalidSessionError(DatasetError):
+    """Raised when credentials are not provided or invalid."""
+
     def error_message(self) -> str:
+        """Return the error message for this exception."""
         return (
             "Credentials not provided or invalid. Please pass in the correct api_key when "
             "initiating a new ArizeExportClient. Alternatively, you can set up credentials "
@@ -23,39 +31,61 @@ class InvalidSessionError(DatasetError):
         )

     def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
         return "InvalidSessionError()"


 class InvalidConfigFileError(DatasetError):
+    """Raised when configuration file is invalid or misconfigured."""
+
     def error_message(self) -> str:
+        """Return the error message for this exception."""
         return "Invalid/Misconfigured Configuration File"

     def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
         return "InvalidConfigFileError()"


 class IDColumnUniqueConstraintError(DatasetError):
+    """Raised when id column contains duplicate values."""
+
     def error_message(self) -> str:
+        """Return the error message for this exception."""
         return "'id' column must contain unique values"

     def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
         return "IDColumnUniqueConstraintError()"


 class RequiredColumnsError(DatasetError):
+    """Raised when required columns are missing from the dataset."""
+
     def __init__(self, missing_columns: set) -> None:
+        """Initialize the exception with missing columns context.
+
+        Args:
+            missing_columns: Set of required columns that are missing.
+        """
         self.missing_columns = missing_columns

     def error_message(self) -> str:
+        """Return the error message for this exception."""
         return f"Missing required columns: {self.missing_columns}"

     def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
         return f"RequiredColumnsError({self.missing_columns})"


 class EmptyDatasetError(DatasetError):
+    """Raised when dataset DataFrame has no rows."""
+
     def error_message(self) -> str:
+        """Return the error message for this exception."""
         return "DataFrame must have at least one row in it."

     def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
         return "EmptyDatasetError()"