arize 8.0.0a22__py3-none-any.whl → 8.0.0a23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. arize/__init__.py +17 -9
  2. arize/_exporter/client.py +55 -36
  3. arize/_exporter/parsers/tracing_data_parser.py +41 -30
  4. arize/_exporter/validation.py +3 -3
  5. arize/_flight/client.py +207 -76
  6. arize/_generated/api_client/__init__.py +30 -6
  7. arize/_generated/api_client/api/__init__.py +1 -0
  8. arize/_generated/api_client/api/datasets_api.py +864 -190
  9. arize/_generated/api_client/api/experiments_api.py +167 -131
  10. arize/_generated/api_client/api/projects_api.py +1197 -0
  11. arize/_generated/api_client/api_client.py +2 -2
  12. arize/_generated/api_client/configuration.py +42 -34
  13. arize/_generated/api_client/exceptions.py +2 -2
  14. arize/_generated/api_client/models/__init__.py +15 -4
  15. arize/_generated/api_client/models/dataset.py +10 -10
  16. arize/_generated/api_client/models/dataset_example.py +111 -0
  17. arize/_generated/api_client/models/dataset_example_update.py +100 -0
  18. arize/_generated/api_client/models/dataset_version.py +13 -13
  19. arize/_generated/api_client/models/datasets_create_request.py +16 -8
  20. arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
  21. arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
  22. arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
  23. arize/_generated/api_client/models/datasets_list200_response.py +10 -4
  24. arize/_generated/api_client/models/experiment.py +14 -16
  25. arize/_generated/api_client/models/experiment_run.py +108 -0
  26. arize/_generated/api_client/models/experiment_run_create.py +102 -0
  27. arize/_generated/api_client/models/experiments_create_request.py +16 -10
  28. arize/_generated/api_client/models/experiments_list200_response.py +10 -4
  29. arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
  30. arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
  31. arize/_generated/api_client/models/primitive_value.py +172 -0
  32. arize/_generated/api_client/models/problem.py +100 -0
  33. arize/_generated/api_client/models/project.py +99 -0
  34. arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
  35. arize/_generated/api_client/models/projects_list200_response.py +106 -0
  36. arize/_generated/api_client/rest.py +2 -2
  37. arize/_generated/api_client/test/test_dataset.py +4 -2
  38. arize/_generated/api_client/test/test_dataset_example.py +56 -0
  39. arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
  40. arize/_generated/api_client/test/test_dataset_version.py +7 -2
  41. arize/_generated/api_client/test/test_datasets_api.py +27 -13
  42. arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
  43. arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
  44. arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
  45. arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
  46. arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
  47. arize/_generated/api_client/test/test_experiment.py +2 -4
  48. arize/_generated/api_client/test/test_experiment_run.py +56 -0
  49. arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
  50. arize/_generated/api_client/test/test_experiments_api.py +6 -6
  51. arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
  52. arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
  53. arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
  54. arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
  55. arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
  56. arize/_generated/api_client/test/test_problem.py +57 -0
  57. arize/_generated/api_client/test/test_project.py +58 -0
  58. arize/_generated/api_client/test/test_projects_api.py +59 -0
  59. arize/_generated/api_client/test/test_projects_create_request.py +54 -0
  60. arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
  61. arize/_generated/api_client_README.md +43 -29
  62. arize/_generated/protocol/flight/flight_pb2.py +400 -0
  63. arize/_lazy.py +27 -19
  64. arize/client.py +268 -55
  65. arize/config.py +365 -116
  66. arize/constants/__init__.py +1 -0
  67. arize/constants/config.py +11 -4
  68. arize/constants/ml.py +6 -4
  69. arize/constants/openinference.py +2 -0
  70. arize/constants/pyarrow.py +2 -0
  71. arize/constants/spans.py +3 -1
  72. arize/datasets/__init__.py +1 -0
  73. arize/datasets/client.py +299 -84
  74. arize/datasets/errors.py +32 -2
  75. arize/datasets/validation.py +18 -8
  76. arize/embeddings/__init__.py +2 -0
  77. arize/embeddings/auto_generator.py +23 -19
  78. arize/embeddings/base_generators.py +89 -36
  79. arize/embeddings/constants.py +2 -0
  80. arize/embeddings/cv_generators.py +26 -4
  81. arize/embeddings/errors.py +27 -5
  82. arize/embeddings/nlp_generators.py +31 -12
  83. arize/embeddings/tabular_generators.py +32 -20
  84. arize/embeddings/usecases.py +12 -2
  85. arize/exceptions/__init__.py +1 -0
  86. arize/exceptions/auth.py +11 -1
  87. arize/exceptions/base.py +29 -4
  88. arize/exceptions/models.py +21 -2
  89. arize/exceptions/parameters.py +31 -0
  90. arize/exceptions/spaces.py +12 -1
  91. arize/exceptions/types.py +86 -7
  92. arize/exceptions/values.py +220 -20
  93. arize/experiments/__init__.py +1 -0
  94. arize/experiments/client.py +389 -285
  95. arize/experiments/evaluators/__init__.py +1 -0
  96. arize/experiments/evaluators/base.py +74 -41
  97. arize/experiments/evaluators/exceptions.py +6 -3
  98. arize/experiments/evaluators/executors.py +121 -73
  99. arize/experiments/evaluators/rate_limiters.py +106 -57
  100. arize/experiments/evaluators/types.py +34 -7
  101. arize/experiments/evaluators/utils.py +65 -27
  102. arize/experiments/functions.py +103 -101
  103. arize/experiments/tracing.py +52 -44
  104. arize/experiments/types.py +56 -31
  105. arize/logging.py +54 -22
  106. arize/models/__init__.py +1 -0
  107. arize/models/batch_validation/__init__.py +1 -0
  108. arize/models/batch_validation/errors.py +543 -65
  109. arize/models/batch_validation/validator.py +339 -300
  110. arize/models/bounded_executor.py +20 -7
  111. arize/models/casting.py +75 -29
  112. arize/models/client.py +326 -107
  113. arize/models/proto.py +95 -40
  114. arize/models/stream_validation.py +42 -14
  115. arize/models/surrogate_explainer/__init__.py +1 -0
  116. arize/models/surrogate_explainer/mimic.py +24 -13
  117. arize/pre_releases.py +43 -0
  118. arize/projects/__init__.py +1 -0
  119. arize/projects/client.py +129 -0
  120. arize/regions.py +40 -0
  121. arize/spans/__init__.py +1 -0
  122. arize/spans/client.py +130 -106
  123. arize/spans/columns.py +13 -0
  124. arize/spans/conversion.py +54 -38
  125. arize/spans/validation/__init__.py +1 -0
  126. arize/spans/validation/annotations/__init__.py +1 -0
  127. arize/spans/validation/annotations/annotations_validation.py +6 -4
  128. arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
  129. arize/spans/validation/annotations/value_validation.py +35 -11
  130. arize/spans/validation/common/__init__.py +1 -0
  131. arize/spans/validation/common/argument_validation.py +33 -8
  132. arize/spans/validation/common/dataframe_form_validation.py +35 -9
  133. arize/spans/validation/common/errors.py +211 -11
  134. arize/spans/validation/common/value_validation.py +80 -13
  135. arize/spans/validation/evals/__init__.py +1 -0
  136. arize/spans/validation/evals/dataframe_form_validation.py +28 -8
  137. arize/spans/validation/evals/evals_validation.py +34 -4
  138. arize/spans/validation/evals/value_validation.py +26 -3
  139. arize/spans/validation/metadata/__init__.py +1 -1
  140. arize/spans/validation/metadata/argument_validation.py +14 -5
  141. arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
  142. arize/spans/validation/metadata/value_validation.py +24 -10
  143. arize/spans/validation/spans/__init__.py +1 -0
  144. arize/spans/validation/spans/dataframe_form_validation.py +34 -13
  145. arize/spans/validation/spans/spans_validation.py +35 -4
  146. arize/spans/validation/spans/value_validation.py +76 -7
  147. arize/types.py +293 -157
  148. arize/utils/__init__.py +1 -0
  149. arize/utils/arrow.py +31 -15
  150. arize/utils/cache.py +34 -6
  151. arize/utils/dataframe.py +19 -2
  152. arize/utils/online_tasks/__init__.py +2 -0
  153. arize/utils/online_tasks/dataframe_preprocessor.py +53 -41
  154. arize/utils/openinference_conversion.py +44 -5
  155. arize/utils/proto.py +10 -0
  156. arize/utils/size.py +5 -3
  157. arize/version.py +3 -1
  158. {arize-8.0.0a22.dist-info → arize-8.0.0a23.dist-info}/METADATA +4 -3
  159. arize-8.0.0a23.dist-info/RECORD +174 -0
  160. {arize-8.0.0a22.dist-info → arize-8.0.0a23.dist-info}/WHEEL +1 -1
  161. arize-8.0.0a23.dist-info/licenses/LICENSE +176 -0
  162. arize-8.0.0a23.dist-info/licenses/NOTICE +13 -0
  163. arize/_generated/protocol/flight/export_pb2.py +0 -61
  164. arize/_generated/protocol/flight/ingest_pb2.py +0 -365
  165. arize-8.0.0a22.dist-info/RECORD +0 -146
  166. arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
arize/constants/ml.py CHANGED
@@ -1,3 +1,5 @@
1
+ """Machine learning constants and validation limits."""
2
+
1
3
  import json
2
4
  from pathlib import Path
3
5
 
@@ -30,7 +32,7 @@ MAX_PROMPT_TEMPLATE_VERSION_LENGTH_TRUNCATION = 50
30
32
  MAX_NUMBER_OF_EMBEDDINGS = 30
31
33
  MAX_EMBEDDING_DIMENSIONALITY = 20_000
32
34
  # # The maximum number of classes for multi class
33
- MAX_NUMBER_OF_MULTI_CLASS_CLASSES = 300
35
+ MAX_NUMBER_OF_MULTI_CLASS_CLASSES = 500
34
36
  MAX_MULTI_CLASS_NAME_LENGTH = 100
35
37
  # The maximum number of references in embedding similarity search params
36
38
  MAX_NUMBER_OF_SIMILARITY_REFERENCES = 10
@@ -40,9 +42,9 @@ MAX_NUMBER_OF_SIMILARITY_REFERENCES = 10
40
42
  # GENERATED_LLM_PARAMS_JSON_COL = "arize_generated_llm_params_json"
41
43
  #
42
44
  # # reserved columns for LLM run metadata
43
- LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME = "total_token_count"
44
- LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME = "prompt_token_count"
45
- LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME = "response_token_count"
45
+ LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME = "total_token_count" # noqa: S105
46
+ LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME = "prompt_token_count" # noqa: S105
47
+ LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME = "response_token_count" # noqa: S105
46
48
  LLM_RUN_METADATA_RESPONSE_LATENCY_MS_TAG_NAME = "response_latency_ms"
47
49
  #
48
50
  # all reserved tags
@@ -1,3 +1,5 @@
1
+ """OpenInference semantic convention constants and attribute definitions."""
2
+
1
3
  import openinference.semconv.trace as oinf
2
4
 
3
5
  OPEN_INFERENCE_JSON_STR_TYPES = frozenset(
@@ -1 +1,3 @@
1
+ """PyArrow-related constants for data processing."""
2
+
1
3
  MAX_CHUNKSIZE = 100_000
arize/constants/spans.py CHANGED
@@ -1,4 +1,6 @@
1
- # The defualt format used to parse datetime objects from strings
1
+ """Span-related constants and validation limits for tracing."""
2
+
3
+ # The default format used to parse datetime objects from strings
2
4
  DEFAULT_DATETIME_FMT = "%Y-%m-%dT%H:%M:%S.%f+00:00"
3
5
  # Minumum/Maximum number of characters for span/trace/parent ids in spans
4
6
  SPAN_ID_MIN_STR_LENGTH = 12
@@ -0,0 +1 @@
1
+ """Dataset management and validation utilities for the Arize SDK."""
arize/datasets/client.py CHANGED
@@ -1,18 +1,20 @@
1
+ """Client implementation for managing datasets in the Arize platform."""
2
+
1
3
  from __future__ import annotations
2
4
 
3
5
  import logging
4
6
  import time
5
7
  import uuid
6
- from typing import Any, Dict, List
8
+ from typing import TYPE_CHECKING
7
9
 
8
10
  import pandas as pd
9
11
  import pyarrow as pa
10
12
 
11
13
  from arize._flight.client import ArizeFlightClient
12
14
  from arize._generated.api_client import models
13
- from arize.config import SDKConfiguration
14
15
  from arize.datasets.validation import validate_dataset_df
15
16
  from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
17
+ from arize.pre_releases import ReleaseStage, prerelease_endpoint
16
18
  from arize.utils.cache import cache_resource, load_cached_resource
17
19
  from arize.utils.openinference_conversion import (
18
20
  convert_boolean_columns_to_str,
@@ -21,40 +23,116 @@ from arize.utils.openinference_conversion import (
21
23
  )
22
24
  from arize.utils.size import get_payload_size_mb
23
25
 
26
+ if TYPE_CHECKING:
27
+ from arize.config import SDKConfiguration
28
+
24
29
  logger = logging.getLogger(__name__)
25
30
 
26
31
 
27
32
  class DatasetsClient:
28
- def __init__(self, *, sdk_config: SDKConfiguration):
33
+ """Client for managing datasets including creation, retrieval, and example management."""
34
+
35
+ def __init__(self, *, sdk_config: SDKConfiguration) -> None:
36
+ """Create a datasets sub-client.
37
+
38
+ The datasets client is a thin wrapper around the generated REST API client,
39
+ using the shared generated API client owned by `SDKConfiguration`.
40
+
41
+ Args:
42
+ sdk_config: Resolved SDK configuration.
43
+ """
29
44
  self._sdk_config = sdk_config
30
45
 
31
- # Import at runtime so its still lazy and extras-gated by the parent
46
+ # Import at runtime so it's still lazy and extras-gated by the parent
32
47
  from arize._generated import api_client as gen
33
48
 
34
49
  # Use the shared generated client from the config
35
50
  self._api = gen.DatasetsApi(self._sdk_config.get_generated_client())
36
51
 
37
- # Forward methods to preserve exact runtime signatures/docs
38
- self.list = self._api.datasets_list
39
- self.get = self._api.datasets_get
40
- self.delete = self._api.datasets_delete
52
+ @prerelease_endpoint(key="datasets.list", stage=ReleaseStage.BETA)
53
+ def list(
54
+ self,
55
+ *,
56
+ space_id: str | None = None,
57
+ limit: int = 100,
58
+ cursor: str | None = None,
59
+ ) -> models.DatasetsList200Response:
60
+ """List datasets the user has access to.
61
+
62
+ Datasets are returned in descending creation order (most recently created
63
+ first). Dataset versions are not included in this response; use `get()` to
64
+ retrieve a dataset along with its versions.
65
+
66
+ Args:
67
+ space_id: Optional space ID to scope results to a single space.
68
+ limit: Maximum number of datasets to return. The server enforces an
69
+ upper bound.
70
+ cursor: Opaque pagination cursor returned from a previous response.
41
71
 
42
- # Custom methods
43
- self.create = self._create_dataset
44
- self.list_examples = self._list_examples
72
+ Returns:
73
+ A response object with the datasets and pagination information.
45
74
 
46
- def _create_dataset(
75
+ Raises:
76
+ arize._generated.api_client.exceptions.ApiException: If the REST API
77
+ returns an error response (e.g. 401/403/429).
78
+ """
79
+ return self._api.datasets_list(
80
+ space_id=space_id,
81
+ limit=limit,
82
+ cursor=cursor,
83
+ )
84
+
85
+ @prerelease_endpoint(key="datasets.create", stage=ReleaseStage.BETA)
86
+ def create(
47
87
  self,
48
88
  *,
49
89
  name: str,
50
90
  space_id: str,
51
- examples: List[Dict[str, Any]] | pd.DataFrame,
91
+ examples: list[dict[str, object]] | pd.DataFrame,
52
92
  force_http: bool = False,
53
- ):
54
- if not isinstance(examples, (list, pd.DataFrame)):
93
+ ) -> models.Dataset:
94
+ """Create a dataset with JSON examples.
95
+
96
+ Empty datasets are not allowed.
97
+
98
+ Payload notes (server-enforced):
99
+ - `name` must be unique within the given `space_id`.
100
+ - Each example may contain arbitrary user-defined fields.
101
+ - Do not include system-managed fields on create: `id`, `created_at`,
102
+ `updated_at` (requests containing these fields will be rejected).
103
+ - Each example must contain at least one property (i.e. `{}` is invalid).
104
+
105
+ Transport selection:
106
+ - If the payload is below the configured REST payload threshold (or
107
+ `force_http=True`), this method uploads via REST.
108
+ - Otherwise, it attempts a more efficient upload path via gRPC + Flight.
109
+
110
+ Args:
111
+ name: Dataset name (must be unique within the target space).
112
+ space_id: Space ID to create the dataset in.
113
+ examples: Dataset examples either as:
114
+ - a list of JSON-like dicts, or
115
+ - a pandas DataFrame (will be converted to records for REST).
116
+ force_http: If True, force REST upload even if the payload exceeds the
117
+ configured REST payload threshold.
118
+
119
+ Returns:
120
+ The created dataset object as returned by the API.
121
+
122
+ Raises:
123
+ TypeError: If `examples` is not a list of dicts or a pandas DataFrame.
124
+ RuntimeError: If the Flight upload path is selected and the Flight request
125
+ fails.
126
+ arize._generated.api_client.exceptions.ApiException: If the REST API
127
+ returns an error response (e.g. 400/401/403/409/429).
128
+ """
129
+ if not isinstance(examples, list | pd.DataFrame):
55
130
  raise TypeError(
56
131
  "Examples must be a list of dicts or a pandas DataFrame"
57
132
  )
133
+ if len(examples) == 0:
134
+ raise ValueError("Cannot create an empty dataset")
135
+
58
136
  below_threshold = (
59
137
  get_payload_size_mb(examples)
60
138
  <= self._sdk_config.max_http_payload_size_mb
@@ -63,14 +141,14 @@ class DatasetsClient:
63
141
  from arize._generated import api_client as gen
64
142
 
65
143
  data = (
66
- examples.to_dict(orient="records")
144
+ examples.to_dict(orient="records") # type: ignore
67
145
  if isinstance(examples, pd.DataFrame)
68
146
  else examples
69
147
  )
70
148
 
71
149
  body = gen.DatasetsCreateRequest(
72
150
  name=name,
73
- spaceId=space_id,
151
+ space_id=space_id,
74
152
  examples=data,
75
153
  )
76
154
  return self._api.datasets_create(datasets_create_request=body)
@@ -93,76 +171,83 @@ class DatasetsClient:
93
171
  examples=data,
94
172
  )
95
173
 
96
- def _create_dataset_via_flight(
97
- self,
98
- name: str,
99
- space_id: str,
100
- examples: pd.DataFrame,
101
- ):
102
- data = examples.copy()
103
- # Convert datetime columns to int64 (ms since epoch)
104
- data = convert_datetime_columns_to_int(data)
105
- data = convert_boolean_columns_to_str(data)
106
- data = _set_default_columns_for_dataset(data)
107
- data = convert_default_columns_to_json_str(data)
174
+ @prerelease_endpoint(key="datasets.get", stage=ReleaseStage.BETA)
175
+ def get(self, *, dataset_id: str) -> models.Dataset:
176
+ """Get a dataset by ID.
108
177
 
109
- validation_errors = validate_dataset_df(data)
110
- if validation_errors:
111
- raise RuntimeError([e.error_message() for e in validation_errors])
178
+ The returned dataset includes its dataset versions (sorted by creation time,
179
+ most recent first). Dataset examples are not included; use `list_examples()`
180
+ to retrieve examples.
112
181
 
113
- # Convert to Arrow table
114
- try:
115
- logger.debug("Converting data to Arrow format")
116
- pa_table = pa.Table.from_pandas(data, preserve_index=False)
117
- except pa.ArrowInvalid as e:
118
- logger.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
119
- raise pa.ArrowInvalid(
120
- f"Error converting to Arrow format: {str(e)}"
121
- ) from e
122
- except Exception as e:
123
- logger.error(f"Unexpected error creating Arrow table: {str(e)}")
124
- raise
182
+ Args:
183
+ dataset_id: Dataset ID to retrieve.
125
184
 
126
- response = None
127
- with ArizeFlightClient(
128
- api_key=self._sdk_config.api_key,
129
- host=self._sdk_config.flight_server_host,
130
- port=self._sdk_config.flight_server_port,
131
- scheme=self._sdk_config.flight_scheme,
132
- request_verify=self._sdk_config.request_verify,
133
- max_chunksize=self._sdk_config.pyarrow_max_chunksize,
134
- ) as flight_client:
135
- try:
136
- response = flight_client.create_dataset(
137
- space_id=space_id,
138
- dataset_name=name,
139
- pa_table=pa_table,
140
- )
141
- except Exception as e:
142
- msg = f"Error during update request: {str(e)}"
143
- logger.error(msg)
144
- raise RuntimeError(msg) from e
145
- if response is None:
146
- # This should not happen with proper Flight client implementation,
147
- # but we handle it defensively
148
- msg = "No response received from flight server during update"
149
- logger.error(msg)
150
- raise RuntimeError(msg)
151
- # The response from flightserver is the dataset ID. To return the dataset
152
- # object we make a GET query
153
- dataset = self.get(dataset_id=response)
154
- return dataset
185
+ Returns:
186
+ The dataset object.
155
187
 
156
- def _list_examples(
188
+ Raises:
189
+ arize._generated.api_client.exceptions.ApiException: If the REST API
190
+ returns an error response (e.g. 401/403/404/429).
191
+ """
192
+ return self._api.datasets_get(dataset_id=dataset_id)
193
+
194
+ @prerelease_endpoint(key="datasets.delete", stage=ReleaseStage.BETA)
195
+ def delete(self, *, dataset_id: str) -> None:
196
+ """Delete a dataset by ID.
197
+
198
+ This operation is irreversible.
199
+
200
+ Args:
201
+ dataset_id: Dataset ID to delete.
202
+
203
+ Returns: This method returns None on success (common empty 204 response)
204
+
205
+ Raises:
206
+ arize._generated.api_client.exceptions.ApiException: If the REST API
207
+ returns an error response (e.g. 401/403/404/429).
208
+ """
209
+ return self._api.datasets_delete(dataset_id=dataset_id)
210
+
211
+ @prerelease_endpoint(key="datasets.list_examples", stage=ReleaseStage.BETA)
212
+ def list_examples(
157
213
  self,
158
214
  *,
159
215
  dataset_id: str,
160
216
  dataset_version_id: str = "",
161
217
  limit: int = 100,
162
218
  all: bool = False,
163
- ):
219
+ ) -> models.DatasetsExamplesList200Response:
220
+ """List examples for a dataset (optionally for a specific version).
221
+
222
+ If `dataset_version_id` is not provided (empty string), the server selects
223
+ the latest dataset version.
224
+
225
+ Pagination notes:
226
+ - The response includes `pagination` for forward compatibility.
227
+ - Cursor pagination may not be fully implemented by the server yet.
228
+ - If `all=True`, this method retrieves all examples via the Flight path,
229
+ and returns them in a single response with `has_more=False`.
230
+
231
+ Args:
232
+ dataset_id: Dataset ID to list examples for.
233
+ dataset_version_id: Dataset version ID. If empty, the latest version is
234
+ selected.
235
+ limit: Maximum number of examples to return when `all=False`. The server
236
+ enforces an upper bound.
237
+ all: If True, fetch all examples (ignores `limit`) via Flight and return a
238
+ single response.
239
+
240
+ Returns:
241
+ A response object containing `examples` and `pagination` metadata.
242
+
243
+ Raises:
244
+ RuntimeError: If the Flight request fails or returns no response when
245
+ `all=True`.
246
+ arize._generated.api_client.exceptions.ApiException: If the REST API
247
+ returns an error response when `all=False` (e.g. 401/403/404/429).
248
+ """
164
249
  if not all:
165
- return self._api.datasets_list_examples(
250
+ return self._api.datasets_examples_list(
166
251
  dataset_id=dataset_id,
167
252
  dataset_version_id=dataset_version_id,
168
253
  limit=limit,
@@ -184,14 +269,17 @@ class DatasetsClient:
184
269
  resource_updated_at=dataset_updated_at,
185
270
  )
186
271
  if dataset_df is not None:
187
- return models.DatasetsListExamples200Response(
188
- examples=dataset_df.to_dict(orient="records")
272
+ return models.DatasetsExamplesList200Response(
273
+ examples=dataset_df.to_dict(orient="records"), # type: ignore
274
+ pagination=models.PaginationMetadata(
275
+ has_more=False, # Note that all=True
276
+ ),
189
277
  )
190
278
 
191
279
  with ArizeFlightClient(
192
280
  api_key=self._sdk_config.api_key,
193
- host=self._sdk_config.flight_server_host,
194
- port=self._sdk_config.flight_server_port,
281
+ host=self._sdk_config.flight_host,
282
+ port=self._sdk_config.flight_port,
195
283
  scheme=self._sdk_config.flight_scheme,
196
284
  request_verify=self._sdk_config.request_verify,
197
285
  max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -203,8 +291,8 @@ class DatasetsClient:
203
291
  dataset_version_id=dataset_version_id,
204
292
  )
205
293
  except Exception as e:
206
- msg = f"Error during request: {str(e)}"
207
- logger.error(msg)
294
+ msg = f"Error during request: {e!s}"
295
+ logger.exception(msg)
208
296
  raise RuntimeError(msg) from e
209
297
  if dataset_df is None:
210
298
  # This should not happen with proper Flight client implementation,
@@ -222,12 +310,139 @@ class DatasetsClient:
222
310
  resource_data=dataset_df,
223
311
  )
224
312
 
225
- return models.DatasetsListExamples200Response(
226
- examples=dataset_df.to_dict(orient="records")
313
+ return models.DatasetsExamplesList200Response(
314
+ examples=dataset_df.to_dict(orient="records"), # type: ignore
315
+ pagination=models.PaginationMetadata(
316
+ has_more=False, # Note that all=True
317
+ ),
227
318
  )
228
319
 
320
+ # TODO(Kiko): Needs flightserver support
321
+ @prerelease_endpoint(
322
+ key="datasets.append_examples", stage=ReleaseStage.BETA
323
+ )
324
+ def append_examples(
325
+ self,
326
+ *,
327
+ dataset_id: str,
328
+ dataset_version_id: str = "",
329
+ examples: list[dict[str, object]] | pd.DataFrame,
330
+ ) -> models.Dataset:
331
+ """Append new examples to an existing dataset.
332
+
333
+ This method adds examples to an existing dataset version. If
334
+ `dataset_version_id` is not provided (empty string), the server appends
335
+ the examples to the latest dataset version.
336
+
337
+ The inserted examples are assigned system-generated IDs by the server.
338
+
339
+ Payload requirements (server-enforced):
340
+ - Each example may contain arbitrary user-defined fields.
341
+ - Do not include system-managed fields on input: `id`, `created_at`,
342
+ `updated_at` (requests containing these fields will be rejected).
343
+ - Each example must contain at least one property (i.e. empty
344
+ examples are not invalid).
345
+
346
+ Args:
347
+ dataset_id: Dataset ID to append examples to.
348
+ dataset_version_id: Optional dataset version ID to append examples to. If empty,
349
+ the latest dataset version is selected.
350
+ examples: Examples to append, provided as either:
351
+ - a list of JSON-like dicts, or
352
+ - a pandas DataFrame (converted to records before upload).
353
+
354
+ Returns:
355
+ The updated dataset object. To see the examples, use `list_examples()`.
356
+
357
+ Raises:
358
+ AssertionError: If `examples` is not a list of dicts or a pandas
359
+ DataFrame.
360
+ arize._generated.api_client.exceptions.ApiException: If the REST API
361
+ returns an error response (e.g. 400/401/403/404/429).
362
+ """
363
+ from arize._generated import api_client as gen
364
+
365
+ if not isinstance(examples, list | pd.DataFrame):
366
+ raise TypeError(
367
+ "Examples must be a list of dicts or a pandas DataFrame"
368
+ )
369
+
370
+ data = (
371
+ examples.to_dict(orient="records") # type: ignore
372
+ if isinstance(examples, pd.DataFrame)
373
+ else examples
374
+ )
375
+ body = gen.DatasetsExamplesInsertRequest(examples=data)
376
+
377
+ return self._api.datasets_examples_insert(
378
+ dataset_id=dataset_id,
379
+ dataset_version_id=dataset_version_id,
380
+ datasets_examples_insert_request=body,
381
+ )
382
+
383
+ def _create_dataset_via_flight(
384
+ self,
385
+ name: str,
386
+ space_id: str,
387
+ examples: pd.DataFrame,
388
+ ) -> object:
389
+ """Internal method to create a dataset using Flight protocol for large example sets."""
390
+ data = examples.copy()
391
+ # Convert datetime columns to int64 (ms since epoch)
392
+ data = convert_datetime_columns_to_int(data)
393
+ data = convert_boolean_columns_to_str(data)
394
+ data = _set_default_columns_for_dataset(data)
395
+ data = convert_default_columns_to_json_str(data)
396
+
397
+ validation_errors = validate_dataset_df(data)
398
+ if validation_errors:
399
+ raise RuntimeError([e.error_message() for e in validation_errors])
400
+
401
+ # Convert to Arrow table
402
+ try:
403
+ logger.debug("Converting data to Arrow format")
404
+ pa_table = pa.Table.from_pandas(data, preserve_index=False)
405
+ except pa.ArrowInvalid as e:
406
+ logger.exception(INVALID_ARROW_CONVERSION_MSG)
407
+ raise pa.ArrowInvalid(
408
+ f"Error converting to Arrow format: {e!s}"
409
+ ) from e
410
+ except Exception:
411
+ logger.exception("Unexpected error creating Arrow table")
412
+ raise
413
+
414
+ response = None
415
+ with ArizeFlightClient(
416
+ api_key=self._sdk_config.api_key,
417
+ host=self._sdk_config.flight_host,
418
+ port=self._sdk_config.flight_port,
419
+ scheme=self._sdk_config.flight_scheme,
420
+ request_verify=self._sdk_config.request_verify,
421
+ max_chunksize=self._sdk_config.pyarrow_max_chunksize,
422
+ ) as flight_client:
423
+ try:
424
+ response = flight_client.create_dataset(
425
+ space_id=space_id,
426
+ dataset_name=name,
427
+ pa_table=pa_table,
428
+ )
429
+ except Exception as e:
430
+ msg = f"Error during create request: {e!s}"
431
+ logger.exception(msg)
432
+ raise RuntimeError(msg) from e
433
+ if response is None:
434
+ # This should not happen with proper Flight client implementation,
435
+ # but we handle it defensively
436
+ msg = "No response received from flight server during update"
437
+ logger.error(msg)
438
+ raise RuntimeError(msg)
439
+ # The response from flightserver is the dataset ID. To return the dataset
440
+ # object we make a GET query
441
+ return self.get(dataset_id=response)
442
+
229
443
 
230
444
  def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
445
+ """Set default values for created_at and updated_at columns if missing or null."""
231
446
  current_time = int(time.time() * 1000)
232
447
  if "created_at" in df.columns:
233
448
  if df["created_at"].isnull().values.any(): # type: ignore
arize/datasets/errors.py CHANGED
@@ -1,21 +1,29 @@
1
+ """Dataset-specific exception classes."""
2
+
1
3
  from abc import ABC, abstractmethod
2
4
 
3
5
 
4
6
  class DatasetError(Exception, ABC):
7
+ """Base exception for dataset-related errors."""
8
+
5
9
  def __str__(self) -> str:
10
+ """Return a human-readable error message."""
6
11
  return self.error_message()
7
12
 
8
13
  @abstractmethod
9
14
  def __repr__(self) -> str:
10
- pass
15
+ """Return a string representation for debugging and logging."""
11
16
 
12
17
  @abstractmethod
13
18
  def error_message(self) -> str:
14
- pass
19
+ """Return the error message for this exception."""
15
20
 
16
21
 
17
22
  class InvalidSessionError(DatasetError):
23
+ """Raised when credentials are not provided or invalid."""
24
+
18
25
  def error_message(self) -> str:
26
+ """Return the error message for this exception."""
19
27
  return (
20
28
  "Credentials not provided or invalid. Please pass in the correct api_key when "
21
29
  "initiating a new ArizeExportClient. Alternatively, you can set up credentials "
@@ -23,39 +31,61 @@ class InvalidSessionError(DatasetError):
23
31
  )
24
32
 
25
33
  def __repr__(self) -> str:
34
+ """Return a string representation for debugging and logging."""
26
35
  return "InvalidSessionError()"
27
36
 
28
37
 
29
38
  class InvalidConfigFileError(DatasetError):
39
+ """Raised when configuration file is invalid or misconfigured."""
40
+
30
41
  def error_message(self) -> str:
42
+ """Return the error message for this exception."""
31
43
  return "Invalid/Misconfigured Configuration File"
32
44
 
33
45
  def __repr__(self) -> str:
46
+ """Return a string representation for debugging and logging."""
34
47
  return "InvalidConfigFileError()"
35
48
 
36
49
 
37
50
  class IDColumnUniqueConstraintError(DatasetError):
51
+ """Raised when id column contains duplicate values."""
52
+
38
53
  def error_message(self) -> str:
54
+ """Return the error message for this exception."""
39
55
  return "'id' column must contain unique values"
40
56
 
41
57
  def __repr__(self) -> str:
58
+ """Return a string representation for debugging and logging."""
42
59
  return "IDColumnUniqueConstraintError()"
43
60
 
44
61
 
45
62
  class RequiredColumnsError(DatasetError):
63
+ """Raised when required columns are missing from the dataset."""
64
+
46
65
  def __init__(self, missing_columns: set) -> None:
66
+ """Initialize the exception with missing columns context.
67
+
68
+ Args:
69
+ missing_columns: Set of required columns that are missing.
70
+ """
47
71
  self.missing_columns = missing_columns
48
72
 
49
73
  def error_message(self) -> str:
74
+ """Return the error message for this exception."""
50
75
  return f"Missing required columns: {self.missing_columns}"
51
76
 
52
77
  def __repr__(self) -> str:
78
+ """Return a string representation for debugging and logging."""
53
79
  return f"RequiredColumnsError({self.missing_columns})"
54
80
 
55
81
 
56
82
  class EmptyDatasetError(DatasetError):
83
+ """Raised when dataset DataFrame has no rows."""
84
+
57
85
  def error_message(self) -> str:
86
+ """Return the error message for this exception."""
58
87
  return "DataFrame must have at least one row in it."
59
88
 
60
89
  def __repr__(self) -> str:
90
+ """Return a string representation for debugging and logging."""
61
91
  return "EmptyDatasetError()"