arize 8.0.0a22__py3-none-any.whl → 8.0.0b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +28 -19
- arize/_exporter/client.py +56 -37
- arize/_exporter/parsers/tracing_data_parser.py +41 -30
- arize/_exporter/validation.py +3 -3
- arize/_flight/client.py +207 -76
- arize/_generated/api_client/__init__.py +30 -6
- arize/_generated/api_client/api/__init__.py +1 -0
- arize/_generated/api_client/api/datasets_api.py +864 -190
- arize/_generated/api_client/api/experiments_api.py +167 -131
- arize/_generated/api_client/api/projects_api.py +1197 -0
- arize/_generated/api_client/api_client.py +2 -2
- arize/_generated/api_client/configuration.py +42 -34
- arize/_generated/api_client/exceptions.py +2 -2
- arize/_generated/api_client/models/__init__.py +15 -4
- arize/_generated/api_client/models/dataset.py +10 -10
- arize/_generated/api_client/models/dataset_example.py +111 -0
- arize/_generated/api_client/models/dataset_example_update.py +100 -0
- arize/_generated/api_client/models/dataset_version.py +13 -13
- arize/_generated/api_client/models/datasets_create_request.py +16 -8
- arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
- arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
- arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
- arize/_generated/api_client/models/datasets_list200_response.py +10 -4
- arize/_generated/api_client/models/experiment.py +14 -16
- arize/_generated/api_client/models/experiment_run.py +108 -0
- arize/_generated/api_client/models/experiment_run_create.py +102 -0
- arize/_generated/api_client/models/experiments_create_request.py +16 -10
- arize/_generated/api_client/models/experiments_list200_response.py +10 -4
- arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
- arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
- arize/_generated/api_client/models/primitive_value.py +172 -0
- arize/_generated/api_client/models/problem.py +100 -0
- arize/_generated/api_client/models/project.py +99 -0
- arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
- arize/_generated/api_client/models/projects_list200_response.py +106 -0
- arize/_generated/api_client/rest.py +2 -2
- arize/_generated/api_client/test/test_dataset.py +4 -2
- arize/_generated/api_client/test/test_dataset_example.py +56 -0
- arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
- arize/_generated/api_client/test/test_dataset_version.py +7 -2
- arize/_generated/api_client/test/test_datasets_api.py +27 -13
- arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
- arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
- arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
- arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
- arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
- arize/_generated/api_client/test/test_experiment.py +2 -4
- arize/_generated/api_client/test/test_experiment_run.py +56 -0
- arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
- arize/_generated/api_client/test/test_experiments_api.py +6 -6
- arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
- arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
- arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
- arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
- arize/_generated/api_client/test/test_problem.py +57 -0
- arize/_generated/api_client/test/test_project.py +58 -0
- arize/_generated/api_client/test/test_projects_api.py +59 -0
- arize/_generated/api_client/test/test_projects_create_request.py +54 -0
- arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
- arize/_generated/api_client_README.md +43 -29
- arize/_generated/protocol/flight/flight_pb2.py +400 -0
- arize/_lazy.py +27 -19
- arize/client.py +181 -58
- arize/config.py +324 -116
- arize/constants/__init__.py +1 -0
- arize/constants/config.py +11 -4
- arize/constants/ml.py +6 -4
- arize/constants/openinference.py +2 -0
- arize/constants/pyarrow.py +2 -0
- arize/constants/spans.py +3 -1
- arize/datasets/__init__.py +1 -0
- arize/datasets/client.py +304 -84
- arize/datasets/errors.py +32 -2
- arize/datasets/validation.py +18 -8
- arize/embeddings/__init__.py +2 -0
- arize/embeddings/auto_generator.py +23 -19
- arize/embeddings/base_generators.py +89 -36
- arize/embeddings/constants.py +2 -0
- arize/embeddings/cv_generators.py +26 -4
- arize/embeddings/errors.py +27 -5
- arize/embeddings/nlp_generators.py +43 -18
- arize/embeddings/tabular_generators.py +46 -31
- arize/embeddings/usecases.py +12 -2
- arize/exceptions/__init__.py +1 -0
- arize/exceptions/auth.py +11 -1
- arize/exceptions/base.py +29 -4
- arize/exceptions/models.py +21 -2
- arize/exceptions/parameters.py +31 -0
- arize/exceptions/spaces.py +12 -1
- arize/exceptions/types.py +86 -7
- arize/exceptions/values.py +220 -20
- arize/experiments/__init__.py +13 -0
- arize/experiments/client.py +394 -285
- arize/experiments/evaluators/__init__.py +1 -0
- arize/experiments/evaluators/base.py +74 -41
- arize/experiments/evaluators/exceptions.py +6 -3
- arize/experiments/evaluators/executors.py +121 -73
- arize/experiments/evaluators/rate_limiters.py +106 -57
- arize/experiments/evaluators/types.py +34 -7
- arize/experiments/evaluators/utils.py +65 -27
- arize/experiments/functions.py +103 -101
- arize/experiments/tracing.py +52 -44
- arize/experiments/types.py +56 -31
- arize/logging.py +54 -22
- arize/ml/__init__.py +1 -0
- arize/ml/batch_validation/__init__.py +1 -0
- arize/{models → ml}/batch_validation/errors.py +545 -67
- arize/{models → ml}/batch_validation/validator.py +344 -303
- arize/ml/bounded_executor.py +47 -0
- arize/{models → ml}/casting.py +118 -108
- arize/{models → ml}/client.py +339 -118
- arize/{models → ml}/proto.py +97 -42
- arize/{models → ml}/stream_validation.py +43 -15
- arize/ml/surrogate_explainer/__init__.py +1 -0
- arize/{models → ml}/surrogate_explainer/mimic.py +25 -10
- arize/{types.py → ml/types.py} +355 -354
- arize/pre_releases.py +44 -0
- arize/projects/__init__.py +1 -0
- arize/projects/client.py +134 -0
- arize/regions.py +40 -0
- arize/spans/__init__.py +1 -0
- arize/spans/client.py +204 -175
- arize/spans/columns.py +13 -0
- arize/spans/conversion.py +60 -37
- arize/spans/validation/__init__.py +1 -0
- arize/spans/validation/annotations/__init__.py +1 -0
- arize/spans/validation/annotations/annotations_validation.py +6 -4
- arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
- arize/spans/validation/annotations/value_validation.py +35 -11
- arize/spans/validation/common/__init__.py +1 -0
- arize/spans/validation/common/argument_validation.py +33 -8
- arize/spans/validation/common/dataframe_form_validation.py +35 -9
- arize/spans/validation/common/errors.py +211 -11
- arize/spans/validation/common/value_validation.py +81 -14
- arize/spans/validation/evals/__init__.py +1 -0
- arize/spans/validation/evals/dataframe_form_validation.py +28 -8
- arize/spans/validation/evals/evals_validation.py +34 -4
- arize/spans/validation/evals/value_validation.py +26 -3
- arize/spans/validation/metadata/__init__.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +14 -5
- arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
- arize/spans/validation/metadata/value_validation.py +24 -10
- arize/spans/validation/spans/__init__.py +1 -0
- arize/spans/validation/spans/dataframe_form_validation.py +35 -14
- arize/spans/validation/spans/spans_validation.py +35 -4
- arize/spans/validation/spans/value_validation.py +78 -8
- arize/utils/__init__.py +1 -0
- arize/utils/arrow.py +31 -15
- arize/utils/cache.py +34 -6
- arize/utils/dataframe.py +20 -3
- arize/utils/online_tasks/__init__.py +2 -0
- arize/utils/online_tasks/dataframe_preprocessor.py +58 -47
- arize/utils/openinference_conversion.py +44 -5
- arize/utils/proto.py +10 -0
- arize/utils/size.py +5 -3
- arize/utils/types.py +105 -0
- arize/version.py +3 -1
- {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/METADATA +13 -6
- arize-8.0.0b0.dist-info/RECORD +175 -0
- {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/WHEEL +1 -1
- arize-8.0.0b0.dist-info/licenses/LICENSE +176 -0
- arize-8.0.0b0.dist-info/licenses/NOTICE +13 -0
- arize/_generated/protocol/flight/export_pb2.py +0 -61
- arize/_generated/protocol/flight/ingest_pb2.py +0 -365
- arize/models/__init__.py +0 -0
- arize/models/batch_validation/__init__.py +0 -0
- arize/models/bounded_executor.py +0 -34
- arize/models/surrogate_explainer/__init__.py +0 -0
- arize-8.0.0a22.dist-info/RECORD +0 -146
- arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
arize/datasets/client.py
CHANGED
|
@@ -1,18 +1,20 @@
|
|
|
1
|
+
"""Client implementation for managing datasets in the Arize platform."""
|
|
2
|
+
|
|
1
3
|
from __future__ import annotations
|
|
2
4
|
|
|
3
5
|
import logging
|
|
4
6
|
import time
|
|
5
7
|
import uuid
|
|
6
|
-
from typing import
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
7
9
|
|
|
8
10
|
import pandas as pd
|
|
9
11
|
import pyarrow as pa
|
|
10
12
|
|
|
11
13
|
from arize._flight.client import ArizeFlightClient
|
|
12
14
|
from arize._generated.api_client import models
|
|
13
|
-
from arize.config import SDKConfiguration
|
|
14
15
|
from arize.datasets.validation import validate_dataset_df
|
|
15
16
|
from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
|
|
17
|
+
from arize.pre_releases import ReleaseStage, prerelease_endpoint
|
|
16
18
|
from arize.utils.cache import cache_resource, load_cached_resource
|
|
17
19
|
from arize.utils.openinference_conversion import (
|
|
18
20
|
convert_boolean_columns_to_str,
|
|
@@ -21,40 +23,121 @@ from arize.utils.openinference_conversion import (
|
|
|
21
23
|
)
|
|
22
24
|
from arize.utils.size import get_payload_size_mb
|
|
23
25
|
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from arize.config import SDKConfiguration
|
|
28
|
+
|
|
24
29
|
logger = logging.getLogger(__name__)
|
|
25
30
|
|
|
26
31
|
|
|
27
32
|
class DatasetsClient:
|
|
28
|
-
|
|
33
|
+
"""Client for managing datasets including creation, retrieval, and example management.
|
|
34
|
+
|
|
35
|
+
This class is primarily intended for internal use within the SDK. Users are
|
|
36
|
+
highly encouraged to access resource-specific functionality via
|
|
37
|
+
:class:`arize.ArizeClient`.
|
|
38
|
+
|
|
39
|
+
The datasets client is a thin wrapper around the generated REST API client,
|
|
40
|
+
using the shared generated API client owned by
|
|
41
|
+
:class:`arize.config.SDKConfiguration`.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
def __init__(self, *, sdk_config: SDKConfiguration) -> None:
|
|
45
|
+
"""
|
|
46
|
+
Args:
|
|
47
|
+
sdk_config: Resolved SDK configuration.
|
|
48
|
+
""" # noqa: D205, D212
|
|
29
49
|
self._sdk_config = sdk_config
|
|
30
50
|
|
|
31
|
-
# Import at runtime so it
|
|
51
|
+
# Import at runtime so it's still lazy and extras-gated by the parent
|
|
32
52
|
from arize._generated import api_client as gen
|
|
33
53
|
|
|
34
54
|
# Use the shared generated client from the config
|
|
35
55
|
self._api = gen.DatasetsApi(self._sdk_config.get_generated_client())
|
|
36
56
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
self
|
|
40
|
-
|
|
57
|
+
@prerelease_endpoint(key="datasets.list", stage=ReleaseStage.BETA)
|
|
58
|
+
def list(
|
|
59
|
+
self,
|
|
60
|
+
*,
|
|
61
|
+
space_id: str | None = None,
|
|
62
|
+
limit: int = 100,
|
|
63
|
+
cursor: str | None = None,
|
|
64
|
+
) -> models.DatasetsList200Response:
|
|
65
|
+
"""List datasets the user has access to.
|
|
66
|
+
|
|
67
|
+
Datasets are returned in descending creation order (most recently created
|
|
68
|
+
first). Dataset versions are not included in this response; use `get()` to
|
|
69
|
+
retrieve a dataset along with its versions.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
space_id: Optional space ID to scope results to a single space.
|
|
73
|
+
limit: Maximum number of datasets to return. The server enforces an
|
|
74
|
+
upper bound.
|
|
75
|
+
cursor: Opaque pagination cursor returned from a previous response.
|
|
41
76
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
self.list_examples = self._list_examples
|
|
77
|
+
Returns:
|
|
78
|
+
A response object with the datasets and pagination information.
|
|
45
79
|
|
|
46
|
-
|
|
80
|
+
Raises:
|
|
81
|
+
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
82
|
+
returns an error response (e.g. 401/403/429).
|
|
83
|
+
"""
|
|
84
|
+
return self._api.datasets_list(
|
|
85
|
+
space_id=space_id,
|
|
86
|
+
limit=limit,
|
|
87
|
+
cursor=cursor,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
@prerelease_endpoint(key="datasets.create", stage=ReleaseStage.BETA)
|
|
91
|
+
def create(
|
|
47
92
|
self,
|
|
48
93
|
*,
|
|
49
94
|
name: str,
|
|
50
95
|
space_id: str,
|
|
51
|
-
examples:
|
|
96
|
+
examples: list[dict[str, object]] | pd.DataFrame,
|
|
52
97
|
force_http: bool = False,
|
|
53
|
-
):
|
|
54
|
-
|
|
98
|
+
) -> models.Dataset:
|
|
99
|
+
"""Create a dataset with JSON examples.
|
|
100
|
+
|
|
101
|
+
Empty datasets are not allowed.
|
|
102
|
+
|
|
103
|
+
Payload notes (server-enforced):
|
|
104
|
+
- `name` must be unique within the given `space_id`.
|
|
105
|
+
- Each example may contain arbitrary user-defined fields.
|
|
106
|
+
- Do not include system-managed fields on create: `id`, `created_at`,
|
|
107
|
+
`updated_at` (requests containing these fields will be rejected).
|
|
108
|
+
- Each example must contain at least one property (i.e. `{}` is invalid).
|
|
109
|
+
|
|
110
|
+
Transport selection:
|
|
111
|
+
- If the payload is below the configured REST payload threshold (or
|
|
112
|
+
`force_http=True`), this method uploads via REST.
|
|
113
|
+
- Otherwise, it attempts a more efficient upload path via gRPC + Flight.
|
|
114
|
+
|
|
115
|
+
Args:
|
|
116
|
+
name: Dataset name (must be unique within the target space).
|
|
117
|
+
space_id: Space ID to create the dataset in.
|
|
118
|
+
examples: Dataset examples either as:
|
|
119
|
+
- a list of JSON-like dicts, or
|
|
120
|
+
- a pandas DataFrame (will be converted to records for REST).
|
|
121
|
+
force_http: If True, force REST upload even if the payload exceeds the
|
|
122
|
+
configured REST payload threshold.
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
The created dataset object as returned by the API.
|
|
126
|
+
|
|
127
|
+
Raises:
|
|
128
|
+
TypeError: If `examples` is not a list of dicts or a pandas DataFrame.
|
|
129
|
+
RuntimeError: If the Flight upload path is selected and the Flight request
|
|
130
|
+
fails.
|
|
131
|
+
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
132
|
+
returns an error response (e.g. 400/401/403/409/429).
|
|
133
|
+
"""
|
|
134
|
+
if not isinstance(examples, list | pd.DataFrame):
|
|
55
135
|
raise TypeError(
|
|
56
136
|
"Examples must be a list of dicts or a pandas DataFrame"
|
|
57
137
|
)
|
|
138
|
+
if len(examples) == 0:
|
|
139
|
+
raise ValueError("Cannot create an empty dataset")
|
|
140
|
+
|
|
58
141
|
below_threshold = (
|
|
59
142
|
get_payload_size_mb(examples)
|
|
60
143
|
<= self._sdk_config.max_http_payload_size_mb
|
|
@@ -63,14 +146,14 @@ class DatasetsClient:
|
|
|
63
146
|
from arize._generated import api_client as gen
|
|
64
147
|
|
|
65
148
|
data = (
|
|
66
|
-
examples.to_dict(orient="records")
|
|
149
|
+
examples.to_dict(orient="records") # type: ignore
|
|
67
150
|
if isinstance(examples, pd.DataFrame)
|
|
68
151
|
else examples
|
|
69
152
|
)
|
|
70
153
|
|
|
71
154
|
body = gen.DatasetsCreateRequest(
|
|
72
155
|
name=name,
|
|
73
|
-
|
|
156
|
+
space_id=space_id,
|
|
74
157
|
examples=data,
|
|
75
158
|
)
|
|
76
159
|
return self._api.datasets_create(datasets_create_request=body)
|
|
@@ -93,76 +176,83 @@ class DatasetsClient:
|
|
|
93
176
|
examples=data,
|
|
94
177
|
)
|
|
95
178
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
space_id: str,
|
|
100
|
-
examples: pd.DataFrame,
|
|
101
|
-
):
|
|
102
|
-
data = examples.copy()
|
|
103
|
-
# Convert datetime columns to int64 (ms since epoch)
|
|
104
|
-
data = convert_datetime_columns_to_int(data)
|
|
105
|
-
data = convert_boolean_columns_to_str(data)
|
|
106
|
-
data = _set_default_columns_for_dataset(data)
|
|
107
|
-
data = convert_default_columns_to_json_str(data)
|
|
179
|
+
@prerelease_endpoint(key="datasets.get", stage=ReleaseStage.BETA)
|
|
180
|
+
def get(self, *, dataset_id: str) -> models.Dataset:
|
|
181
|
+
"""Get a dataset by ID.
|
|
108
182
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
183
|
+
The returned dataset includes its dataset versions (sorted by creation time,
|
|
184
|
+
most recent first). Dataset examples are not included; use `list_examples()`
|
|
185
|
+
to retrieve examples.
|
|
112
186
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
logger.debug("Converting data to Arrow format")
|
|
116
|
-
pa_table = pa.Table.from_pandas(data, preserve_index=False)
|
|
117
|
-
except pa.ArrowInvalid as e:
|
|
118
|
-
logger.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
|
|
119
|
-
raise pa.ArrowInvalid(
|
|
120
|
-
f"Error converting to Arrow format: {str(e)}"
|
|
121
|
-
) from e
|
|
122
|
-
except Exception as e:
|
|
123
|
-
logger.error(f"Unexpected error creating Arrow table: {str(e)}")
|
|
124
|
-
raise
|
|
187
|
+
Args:
|
|
188
|
+
dataset_id: Dataset ID to retrieve.
|
|
125
189
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
api_key=self._sdk_config.api_key,
|
|
129
|
-
host=self._sdk_config.flight_server_host,
|
|
130
|
-
port=self._sdk_config.flight_server_port,
|
|
131
|
-
scheme=self._sdk_config.flight_scheme,
|
|
132
|
-
request_verify=self._sdk_config.request_verify,
|
|
133
|
-
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
134
|
-
) as flight_client:
|
|
135
|
-
try:
|
|
136
|
-
response = flight_client.create_dataset(
|
|
137
|
-
space_id=space_id,
|
|
138
|
-
dataset_name=name,
|
|
139
|
-
pa_table=pa_table,
|
|
140
|
-
)
|
|
141
|
-
except Exception as e:
|
|
142
|
-
msg = f"Error during update request: {str(e)}"
|
|
143
|
-
logger.error(msg)
|
|
144
|
-
raise RuntimeError(msg) from e
|
|
145
|
-
if response is None:
|
|
146
|
-
# This should not happen with proper Flight client implementation,
|
|
147
|
-
# but we handle it defensively
|
|
148
|
-
msg = "No response received from flight server during update"
|
|
149
|
-
logger.error(msg)
|
|
150
|
-
raise RuntimeError(msg)
|
|
151
|
-
# The response from flightserver is the dataset ID. To return the dataset
|
|
152
|
-
# object we make a GET query
|
|
153
|
-
dataset = self.get(dataset_id=response)
|
|
154
|
-
return dataset
|
|
190
|
+
Returns:
|
|
191
|
+
The dataset object.
|
|
155
192
|
|
|
156
|
-
|
|
193
|
+
Raises:
|
|
194
|
+
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
195
|
+
returns an error response (e.g. 401/403/404/429).
|
|
196
|
+
"""
|
|
197
|
+
return self._api.datasets_get(dataset_id=dataset_id)
|
|
198
|
+
|
|
199
|
+
@prerelease_endpoint(key="datasets.delete", stage=ReleaseStage.BETA)
|
|
200
|
+
def delete(self, *, dataset_id: str) -> None:
|
|
201
|
+
"""Delete a dataset by ID.
|
|
202
|
+
|
|
203
|
+
This operation is irreversible.
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
dataset_id: Dataset ID to delete.
|
|
207
|
+
|
|
208
|
+
Returns: None on success (the API responds with an empty 204).
|
|
209
|
+
|
|
210
|
+
Raises:
|
|
211
|
+
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
212
|
+
returns an error response (e.g. 401/403/404/429).
|
|
213
|
+
"""
|
|
214
|
+
return self._api.datasets_delete(dataset_id=dataset_id)
|
|
215
|
+
|
|
216
|
+
@prerelease_endpoint(key="datasets.list_examples", stage=ReleaseStage.BETA)
|
|
217
|
+
def list_examples(
|
|
157
218
|
self,
|
|
158
219
|
*,
|
|
159
220
|
dataset_id: str,
|
|
160
221
|
dataset_version_id: str = "",
|
|
161
222
|
limit: int = 100,
|
|
162
223
|
all: bool = False,
|
|
163
|
-
):
|
|
224
|
+
) -> models.DatasetsExamplesList200Response:
|
|
225
|
+
"""List examples for a dataset (optionally for a specific version).
|
|
226
|
+
|
|
227
|
+
If `dataset_version_id` is not provided (empty string), the server selects
|
|
228
|
+
the latest dataset version.
|
|
229
|
+
|
|
230
|
+
Pagination notes:
|
|
231
|
+
- The response includes `pagination` for forward compatibility.
|
|
232
|
+
- Cursor pagination may not be fully implemented by the server yet.
|
|
233
|
+
- If `all=True`, this method retrieves all examples via the Flight path,
|
|
234
|
+
and returns them in a single response with `has_more=False`.
|
|
235
|
+
|
|
236
|
+
Args:
|
|
237
|
+
dataset_id: Dataset ID to list examples for.
|
|
238
|
+
dataset_version_id: Dataset version ID. If empty, the latest version is
|
|
239
|
+
selected.
|
|
240
|
+
limit: Maximum number of examples to return when `all=False`. The server
|
|
241
|
+
enforces an upper bound.
|
|
242
|
+
all: If True, fetch all examples (ignores `limit`) via Flight and return a
|
|
243
|
+
single response.
|
|
244
|
+
|
|
245
|
+
Returns:
|
|
246
|
+
A response object containing `examples` and `pagination` metadata.
|
|
247
|
+
|
|
248
|
+
Raises:
|
|
249
|
+
RuntimeError: If the Flight request fails or returns no response when
|
|
250
|
+
`all=True`.
|
|
251
|
+
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
252
|
+
returns an error response when `all=False` (e.g. 401/403/404/429).
|
|
253
|
+
"""
|
|
164
254
|
if not all:
|
|
165
|
-
return self._api.
|
|
255
|
+
return self._api.datasets_examples_list(
|
|
166
256
|
dataset_id=dataset_id,
|
|
167
257
|
dataset_version_id=dataset_version_id,
|
|
168
258
|
limit=limit,
|
|
@@ -184,14 +274,17 @@ class DatasetsClient:
|
|
|
184
274
|
resource_updated_at=dataset_updated_at,
|
|
185
275
|
)
|
|
186
276
|
if dataset_df is not None:
|
|
187
|
-
return models.
|
|
188
|
-
examples=dataset_df.to_dict(orient="records")
|
|
277
|
+
return models.DatasetsExamplesList200Response(
|
|
278
|
+
examples=dataset_df.to_dict(orient="records"), # type: ignore
|
|
279
|
+
pagination=models.PaginationMetadata(
|
|
280
|
+
has_more=False, # Note that all=True
|
|
281
|
+
),
|
|
189
282
|
)
|
|
190
283
|
|
|
191
284
|
with ArizeFlightClient(
|
|
192
285
|
api_key=self._sdk_config.api_key,
|
|
193
|
-
host=self._sdk_config.
|
|
194
|
-
port=self._sdk_config.
|
|
286
|
+
host=self._sdk_config.flight_host,
|
|
287
|
+
port=self._sdk_config.flight_port,
|
|
195
288
|
scheme=self._sdk_config.flight_scheme,
|
|
196
289
|
request_verify=self._sdk_config.request_verify,
|
|
197
290
|
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
@@ -203,8 +296,8 @@ class DatasetsClient:
|
|
|
203
296
|
dataset_version_id=dataset_version_id,
|
|
204
297
|
)
|
|
205
298
|
except Exception as e:
|
|
206
|
-
msg = f"Error during request: {
|
|
207
|
-
logger.
|
|
299
|
+
msg = f"Error during request: {e!s}"
|
|
300
|
+
logger.exception(msg)
|
|
208
301
|
raise RuntimeError(msg) from e
|
|
209
302
|
if dataset_df is None:
|
|
210
303
|
# This should not happen with proper Flight client implementation,
|
|
@@ -222,12 +315,139 @@ class DatasetsClient:
|
|
|
222
315
|
resource_data=dataset_df,
|
|
223
316
|
)
|
|
224
317
|
|
|
225
|
-
return models.
|
|
226
|
-
examples=dataset_df.to_dict(orient="records")
|
|
318
|
+
return models.DatasetsExamplesList200Response(
|
|
319
|
+
examples=dataset_df.to_dict(orient="records"), # type: ignore
|
|
320
|
+
pagination=models.PaginationMetadata(
|
|
321
|
+
has_more=False, # Note that all=True
|
|
322
|
+
),
|
|
227
323
|
)
|
|
228
324
|
|
|
325
|
+
# TODO(Kiko): Needs flightserver support
|
|
326
|
+
@prerelease_endpoint(
|
|
327
|
+
key="datasets.append_examples", stage=ReleaseStage.BETA
|
|
328
|
+
)
|
|
329
|
+
def append_examples(
|
|
330
|
+
self,
|
|
331
|
+
*,
|
|
332
|
+
dataset_id: str,
|
|
333
|
+
dataset_version_id: str = "",
|
|
334
|
+
examples: list[dict[str, object]] | pd.DataFrame,
|
|
335
|
+
) -> models.Dataset:
|
|
336
|
+
"""Append new examples to an existing dataset.
|
|
337
|
+
|
|
338
|
+
This method adds examples to an existing dataset version. If
|
|
339
|
+
`dataset_version_id` is not provided (empty string), the server appends
|
|
340
|
+
the examples to the latest dataset version.
|
|
341
|
+
|
|
342
|
+
The inserted examples are assigned system-generated IDs by the server.
|
|
343
|
+
|
|
344
|
+
Payload requirements (server-enforced):
|
|
345
|
+
- Each example may contain arbitrary user-defined fields.
|
|
346
|
+
- Do not include system-managed fields on input: `id`, `created_at`,
|
|
347
|
+
`updated_at` (requests containing these fields will be rejected).
|
|
348
|
+
- Each example must contain at least one property (i.e. empty
|
|
349
|
+
examples are invalid).
|
|
350
|
+
|
|
351
|
+
Args:
|
|
352
|
+
dataset_id: Dataset ID to append examples to.
|
|
353
|
+
dataset_version_id: Optional dataset version ID to append examples to. If empty,
|
|
354
|
+
the latest dataset version is selected.
|
|
355
|
+
examples: Examples to append, provided as either:
|
|
356
|
+
- a list of JSON-like dicts, or
|
|
357
|
+
- a pandas DataFrame (converted to records before upload).
|
|
358
|
+
|
|
359
|
+
Returns:
|
|
360
|
+
The updated dataset object. To see the examples, use `list_examples()`.
|
|
361
|
+
|
|
362
|
+
Raises:
|
|
363
|
+
AssertionError: If `examples` is not a list of dicts or a pandas
|
|
364
|
+
DataFrame.
|
|
365
|
+
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
366
|
+
returns an error response (e.g. 400/401/403/404/429).
|
|
367
|
+
"""
|
|
368
|
+
from arize._generated import api_client as gen
|
|
369
|
+
|
|
370
|
+
if not isinstance(examples, list | pd.DataFrame):
|
|
371
|
+
raise TypeError(
|
|
372
|
+
"Examples must be a list of dicts or a pandas DataFrame"
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
data = (
|
|
376
|
+
examples.to_dict(orient="records") # type: ignore
|
|
377
|
+
if isinstance(examples, pd.DataFrame)
|
|
378
|
+
else examples
|
|
379
|
+
)
|
|
380
|
+
body = gen.DatasetsExamplesInsertRequest(examples=data)
|
|
381
|
+
|
|
382
|
+
return self._api.datasets_examples_insert(
|
|
383
|
+
dataset_id=dataset_id,
|
|
384
|
+
dataset_version_id=dataset_version_id,
|
|
385
|
+
datasets_examples_insert_request=body,
|
|
386
|
+
)
|
|
387
|
+
|
|
388
|
+
def _create_dataset_via_flight(
|
|
389
|
+
self,
|
|
390
|
+
name: str,
|
|
391
|
+
space_id: str,
|
|
392
|
+
examples: pd.DataFrame,
|
|
393
|
+
) -> object:
|
|
394
|
+
"""Internal method to create a dataset using Flight protocol for large example sets."""
|
|
395
|
+
data = examples.copy()
|
|
396
|
+
# Convert datetime columns to int64 (ms since epoch)
|
|
397
|
+
data = convert_datetime_columns_to_int(data)
|
|
398
|
+
data = convert_boolean_columns_to_str(data)
|
|
399
|
+
data = _set_default_columns_for_dataset(data)
|
|
400
|
+
data = convert_default_columns_to_json_str(data)
|
|
401
|
+
|
|
402
|
+
validation_errors = validate_dataset_df(data)
|
|
403
|
+
if validation_errors:
|
|
404
|
+
raise RuntimeError([e.error_message() for e in validation_errors])
|
|
405
|
+
|
|
406
|
+
# Convert to Arrow table
|
|
407
|
+
try:
|
|
408
|
+
logger.debug("Converting data to Arrow format")
|
|
409
|
+
pa_table = pa.Table.from_pandas(data, preserve_index=False)
|
|
410
|
+
except pa.ArrowInvalid as e:
|
|
411
|
+
logger.exception(INVALID_ARROW_CONVERSION_MSG)
|
|
412
|
+
raise pa.ArrowInvalid(
|
|
413
|
+
f"Error converting to Arrow format: {e!s}"
|
|
414
|
+
) from e
|
|
415
|
+
except Exception:
|
|
416
|
+
logger.exception("Unexpected error creating Arrow table")
|
|
417
|
+
raise
|
|
418
|
+
|
|
419
|
+
response = None
|
|
420
|
+
with ArizeFlightClient(
|
|
421
|
+
api_key=self._sdk_config.api_key,
|
|
422
|
+
host=self._sdk_config.flight_host,
|
|
423
|
+
port=self._sdk_config.flight_port,
|
|
424
|
+
scheme=self._sdk_config.flight_scheme,
|
|
425
|
+
request_verify=self._sdk_config.request_verify,
|
|
426
|
+
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
427
|
+
) as flight_client:
|
|
428
|
+
try:
|
|
429
|
+
response = flight_client.create_dataset(
|
|
430
|
+
space_id=space_id,
|
|
431
|
+
dataset_name=name,
|
|
432
|
+
pa_table=pa_table,
|
|
433
|
+
)
|
|
434
|
+
except Exception as e:
|
|
435
|
+
msg = f"Error during create request: {e!s}"
|
|
436
|
+
logger.exception(msg)
|
|
437
|
+
raise RuntimeError(msg) from e
|
|
438
|
+
if response is None:
|
|
439
|
+
# This should not happen with proper Flight client implementation,
|
|
440
|
+
# but we handle it defensively
|
|
441
|
+
msg = "No response received from flight server during update"
|
|
442
|
+
logger.error(msg)
|
|
443
|
+
raise RuntimeError(msg)
|
|
444
|
+
# The response from flightserver is the dataset ID. To return the dataset
|
|
445
|
+
# object we make a GET query
|
|
446
|
+
return self.get(dataset_id=response)
|
|
447
|
+
|
|
229
448
|
|
|
230
449
|
def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
|
|
450
|
+
"""Set default values for created_at and updated_at columns if missing or null."""
|
|
231
451
|
current_time = int(time.time() * 1000)
|
|
232
452
|
if "created_at" in df.columns:
|
|
233
453
|
if df["created_at"].isnull().values.any(): # type: ignore
|
arize/datasets/errors.py
CHANGED
|
@@ -1,21 +1,29 @@
|
|
|
1
|
+
"""Dataset-specific exception classes."""
|
|
2
|
+
|
|
1
3
|
from abc import ABC, abstractmethod
|
|
2
4
|
|
|
3
5
|
|
|
4
6
|
class DatasetError(Exception, ABC):
|
|
7
|
+
"""Base exception for dataset-related errors."""
|
|
8
|
+
|
|
5
9
|
def __str__(self) -> str:
|
|
10
|
+
"""Return a human-readable error message."""
|
|
6
11
|
return self.error_message()
|
|
7
12
|
|
|
8
13
|
@abstractmethod
|
|
9
14
|
def __repr__(self) -> str:
|
|
10
|
-
|
|
15
|
+
"""Return a string representation for debugging and logging."""
|
|
11
16
|
|
|
12
17
|
@abstractmethod
|
|
13
18
|
def error_message(self) -> str:
|
|
14
|
-
|
|
19
|
+
"""Return the error message for this exception."""
|
|
15
20
|
|
|
16
21
|
|
|
17
22
|
class InvalidSessionError(DatasetError):
|
|
23
|
+
"""Raised when credentials are not provided or invalid."""
|
|
24
|
+
|
|
18
25
|
def error_message(self) -> str:
|
|
26
|
+
"""Return the error message for this exception."""
|
|
19
27
|
return (
|
|
20
28
|
"Credentials not provided or invalid. Please pass in the correct api_key when "
|
|
21
29
|
"initiating a new ArizeExportClient. Alternatively, you can set up credentials "
|
|
@@ -23,39 +31,61 @@ class InvalidSessionError(DatasetError):
|
|
|
23
31
|
)
|
|
24
32
|
|
|
25
33
|
def __repr__(self) -> str:
|
|
34
|
+
"""Return a string representation for debugging and logging."""
|
|
26
35
|
return "InvalidSessionError()"
|
|
27
36
|
|
|
28
37
|
|
|
29
38
|
class InvalidConfigFileError(DatasetError):
|
|
39
|
+
"""Raised when configuration file is invalid or misconfigured."""
|
|
40
|
+
|
|
30
41
|
def error_message(self) -> str:
|
|
42
|
+
"""Return the error message for this exception."""
|
|
31
43
|
return "Invalid/Misconfigured Configuration File"
|
|
32
44
|
|
|
33
45
|
def __repr__(self) -> str:
|
|
46
|
+
"""Return a string representation for debugging and logging."""
|
|
34
47
|
return "InvalidConfigFileError()"
|
|
35
48
|
|
|
36
49
|
|
|
37
50
|
class IDColumnUniqueConstraintError(DatasetError):
|
|
51
|
+
"""Raised when id column contains duplicate values."""
|
|
52
|
+
|
|
38
53
|
def error_message(self) -> str:
|
|
54
|
+
"""Return the error message for this exception."""
|
|
39
55
|
return "'id' column must contain unique values"
|
|
40
56
|
|
|
41
57
|
def __repr__(self) -> str:
|
|
58
|
+
"""Return a string representation for debugging and logging."""
|
|
42
59
|
return "IDColumnUniqueConstraintError()"
|
|
43
60
|
|
|
44
61
|
|
|
45
62
|
class RequiredColumnsError(DatasetError):
|
|
63
|
+
"""Raised when required columns are missing from the dataset."""
|
|
64
|
+
|
|
46
65
|
def __init__(self, missing_columns: set) -> None:
|
|
66
|
+
"""Initialize the exception with missing columns context.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
missing_columns: Set of required columns that are missing.
|
|
70
|
+
"""
|
|
47
71
|
self.missing_columns = missing_columns
|
|
48
72
|
|
|
49
73
|
def error_message(self) -> str:
|
|
74
|
+
"""Return the error message for this exception."""
|
|
50
75
|
return f"Missing required columns: {self.missing_columns}"
|
|
51
76
|
|
|
52
77
|
def __repr__(self) -> str:
|
|
78
|
+
"""Return a string representation for debugging and logging."""
|
|
53
79
|
return f"RequiredColumnsError({self.missing_columns})"
|
|
54
80
|
|
|
55
81
|
|
|
56
82
|
class EmptyDatasetError(DatasetError):
|
|
83
|
+
"""Raised when dataset DataFrame has no rows."""
|
|
84
|
+
|
|
57
85
|
def error_message(self) -> str:
|
|
86
|
+
"""Return the error message for this exception."""
|
|
58
87
|
return "DataFrame must have at least one row in it."
|
|
59
88
|
|
|
60
89
|
def __repr__(self) -> str:
|
|
90
|
+
"""Return a string representation for debugging and logging."""
|
|
61
91
|
return "EmptyDatasetError()"
|
arize/datasets/validation.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
"""Dataset validation logic for structure and content checks."""
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -7,7 +7,17 @@ from arize.datasets import errors as err
|
|
|
7
7
|
|
|
8
8
|
def validate_dataset_df(
|
|
9
9
|
df: pd.DataFrame,
|
|
10
|
-
) ->
|
|
10
|
+
) -> list[err.DatasetError]:
|
|
11
|
+
"""Validate a dataset DataFrame for structural and content errors.
|
|
12
|
+
|
|
13
|
+
Checks for required columns, unique ID values, and non-empty data.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
df: The pandas DataFrame to validate.
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
A list of DatasetError objects found during validation. Empty list if valid.
|
|
20
|
+
"""
|
|
11
21
|
## check all required columns are present
|
|
12
22
|
required_columns_errors = _check_required_columns(df)
|
|
13
23
|
if required_columns_errors:
|
|
@@ -19,14 +29,14 @@ def validate_dataset_df(
|
|
|
19
29
|
return id_column_unique_constraint_error
|
|
20
30
|
|
|
21
31
|
# check DataFrame has at least one row in it
|
|
22
|
-
|
|
23
|
-
if
|
|
24
|
-
return
|
|
32
|
+
empty_dataframe_error = _check_empty_dataframe(df)
|
|
33
|
+
if empty_dataframe_error:
|
|
34
|
+
return empty_dataframe_error
|
|
25
35
|
|
|
26
36
|
return []
|
|
27
37
|
|
|
28
38
|
|
|
29
|
-
def _check_required_columns(df: pd.DataFrame) ->
|
|
39
|
+
def _check_required_columns(df: pd.DataFrame) -> list[err.DatasetError]:
|
|
30
40
|
required_columns = ["id", "created_at", "updated_at"]
|
|
31
41
|
missing_columns = set(required_columns) - set(df.columns)
|
|
32
42
|
if missing_columns:
|
|
@@ -34,13 +44,13 @@ def _check_required_columns(df: pd.DataFrame) -> List[err.DatasetError]:
|
|
|
34
44
|
return []
|
|
35
45
|
|
|
36
46
|
|
|
37
|
-
def _check_id_column_is_unique(df: pd.DataFrame) ->
|
|
47
|
+
def _check_id_column_is_unique(df: pd.DataFrame) -> list[err.DatasetError]:
|
|
38
48
|
if not df["id"].is_unique:
|
|
39
49
|
return [err.IDColumnUniqueConstraintError()]
|
|
40
50
|
return []
|
|
41
51
|
|
|
42
52
|
|
|
43
|
-
def _check_empty_dataframe(df: pd.DataFrame) ->
|
|
53
|
+
def _check_empty_dataframe(df: pd.DataFrame) -> list[err.DatasetError]:
|
|
44
54
|
if df.empty:
|
|
45
55
|
return [err.EmptyDatasetError()]
|
|
46
56
|
return []
|