arize 8.0.0a16__py3-none-any.whl → 8.0.0a17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +1 -0
- arize/_flight/client.py +32 -1
- arize/client.py +8 -0
- arize/config.py +14 -0
- arize/constants/config.py +4 -0
- arize/datasets/client.py +77 -56
- arize/experiments/client.py +118 -17
- arize/utils/cache.py +68 -0
- arize/version.py +1 -1
- {arize-8.0.0a16.dist-info → arize-8.0.0a17.dist-info}/METADATA +217 -14
- {arize-8.0.0a16.dist-info → arize-8.0.0a17.dist-info}/RECORD +13 -12
- {arize-8.0.0a16.dist-info → arize-8.0.0a17.dist-info}/WHEEL +0 -0
- {arize-8.0.0a16.dist-info → arize-8.0.0a17.dist-info}/licenses/LICENSE.md +0 -0
arize/__init__.py
CHANGED
|
@@ -87,3 +87,4 @@ def make_to_df(field_name: str):
|
|
|
87
87
|
models.DatasetsList200Response.to_df = make_to_df("datasets") # type: ignore[attr-defined]
|
|
88
88
|
models.DatasetsListExamples200Response.to_df = make_to_df("examples") # type: ignore[attr-defined]
|
|
89
89
|
models.ExperimentsList200Response.to_df = make_to_df("experiments") # type: ignore[attr-defined]
|
|
90
|
+
models.ExperimentsRunsList200Response.to_df = make_to_df("experiment_runs") # type: ignore[attr-defined]
|
arize/_flight/client.py
CHANGED
|
@@ -25,6 +25,7 @@ from arize.utils.proto import get_pb_schema_tracing
|
|
|
25
25
|
from arize.version import __version__
|
|
26
26
|
|
|
27
27
|
if TYPE_CHECKING:
|
|
28
|
+
import pandas as pd
|
|
28
29
|
import pyarrow as pa
|
|
29
30
|
|
|
30
31
|
|
|
@@ -260,7 +261,7 @@ class ArizeFlightClient:
|
|
|
260
261
|
space_id: str,
|
|
261
262
|
dataset_id: str,
|
|
262
263
|
dataset_version_id: str | None = None,
|
|
263
|
-
):
|
|
264
|
+
) -> pd.DataFrame:
|
|
264
265
|
# TODO(Kiko): Space ID should not be needed,
|
|
265
266
|
# should work on server tech debt to remove this
|
|
266
267
|
doget_request = flight_ing_pb2.DoGetRequest(
|
|
@@ -283,6 +284,36 @@ class ArizeFlightClient:
|
|
|
283
284
|
logger.exception(f"Failed to get dataset id={dataset_id}")
|
|
284
285
|
raise RuntimeError(f"Failed to get dataset id={dataset_id}") from e
|
|
285
286
|
|
|
287
|
+
# ---------- experiment methods ----------
|
|
288
|
+
|
|
289
|
+
def get_experiment_runs(
|
|
290
|
+
self,
|
|
291
|
+
space_id: str,
|
|
292
|
+
experiment_id: str,
|
|
293
|
+
) -> pd.DataFrame:
|
|
294
|
+
# TODO(Kiko): Space ID should not be needed,
|
|
295
|
+
# should work on server tech debt to remove this
|
|
296
|
+
doget_request = flight_ing_pb2.DoGetRequest(
|
|
297
|
+
get_experiment=flight_ing_pb2.GetExperimentRequest(
|
|
298
|
+
space_id=space_id,
|
|
299
|
+
experiment_id=experiment_id,
|
|
300
|
+
)
|
|
301
|
+
)
|
|
302
|
+
descriptor = flight.Ticket(
|
|
303
|
+
json_format.MessageToJson(doget_request).encode("utf-8")
|
|
304
|
+
)
|
|
305
|
+
try:
|
|
306
|
+
reader = self.do_get(descriptor, options=self.call_options)
|
|
307
|
+
# read all data into pandas dataframe
|
|
308
|
+
df = reader.read_all().to_pandas()
|
|
309
|
+
df = convert_json_str_to_dict(df)
|
|
310
|
+
return df
|
|
311
|
+
except Exception as e:
|
|
312
|
+
logger.exception(f"Failed to get experiment id={experiment_id}")
|
|
313
|
+
raise RuntimeError(
|
|
314
|
+
f"Failed to get experiment id={experiment_id}"
|
|
315
|
+
) from e
|
|
316
|
+
|
|
286
317
|
def init_experiment(
|
|
287
318
|
self,
|
|
288
319
|
space_id: str,
|
arize/client.py
CHANGED
|
@@ -12,6 +12,14 @@ if TYPE_CHECKING:
|
|
|
12
12
|
from arize.spans.client import SpansClient
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
# TODO(Kiko): models need to follow resource first pattern
|
|
16
|
+
# - models.DatasetsList200Response
|
|
17
|
+
# - models.DatasetsListExamples200Response
|
|
18
|
+
# - models.ExperimentsList200Response
|
|
19
|
+
# - models.ExperimentsRunsList200Response
|
|
20
|
+
# TODO(Kiko): Root client should have option to clear caches
|
|
21
|
+
# TODO(Kiko): Document caching behavior
|
|
22
|
+
# TODO(Kiko): Force keyword arguments
|
|
15
23
|
# TODO(Kiko): Protobuf versioning is too old
|
|
16
24
|
# TODO(Kiko): Make sure the client has same options as SDKConfiguration
|
|
17
25
|
# TODO(Kiko): It does not make any sense to require space ID in run_experiment, dataset ID should suffice
|
arize/config.py
CHANGED
|
@@ -7,6 +7,8 @@ from typing import Any, Dict
|
|
|
7
7
|
|
|
8
8
|
from arize.constants.config import (
|
|
9
9
|
DEFAULT_API_HOST,
|
|
10
|
+
DEFAULT_ARIZE_DIRECTORY,
|
|
11
|
+
DEFAULT_ENABLE_CACHING,
|
|
10
12
|
DEFAULT_FLIGHT_HOST,
|
|
11
13
|
DEFAULT_FLIGHT_PORT,
|
|
12
14
|
DEFAULT_FLIGHT_TRANSPORT_SCHEME,
|
|
@@ -19,6 +21,8 @@ from arize.constants.config import (
|
|
|
19
21
|
DEFAULT_STREAM_MAX_WORKERS,
|
|
20
22
|
ENV_API_HOST,
|
|
21
23
|
ENV_API_KEY,
|
|
24
|
+
ENV_ARIZE_DIRECTORY,
|
|
25
|
+
ENV_ENABLE_CACHING,
|
|
22
26
|
ENV_FLIGHT_HOST,
|
|
23
27
|
ENV_FLIGHT_PORT,
|
|
24
28
|
ENV_FLIGHT_TRANSPORT_SCHEME,
|
|
@@ -116,6 +120,14 @@ def _max_http_payload_size_mb_factory() -> float:
|
|
|
116
120
|
)
|
|
117
121
|
|
|
118
122
|
|
|
123
|
+
def _arize_dir_factory() -> str:
|
|
124
|
+
return os.getenv(ENV_ARIZE_DIRECTORY, DEFAULT_ARIZE_DIRECTORY)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _enable_cache_factory() -> bool:
|
|
128
|
+
return _parse_bool(os.getenv(ENV_ENABLE_CACHING, DEFAULT_ENABLE_CACHING))
|
|
129
|
+
|
|
130
|
+
|
|
119
131
|
def _mask_secret(secret: str, N: int = 4) -> str:
|
|
120
132
|
"""Show first N chars then '***'; empty string if empty."""
|
|
121
133
|
return f"{secret[:N]}***"
|
|
@@ -147,6 +159,8 @@ class SDKConfiguration:
|
|
|
147
159
|
max_http_payload_size_mb: float = field(
|
|
148
160
|
default_factory=_max_http_payload_size_mb_factory
|
|
149
161
|
)
|
|
162
|
+
arize_direcory: str = field(default_factory=_arize_dir_factory)
|
|
163
|
+
enable_caching: bool = field(default_factory=_enable_cache_factory)
|
|
150
164
|
|
|
151
165
|
# Private, excluded from comparisons & repr
|
|
152
166
|
_headers: Dict[str, str] = field(init=False, repr=False, compare=False)
|
arize/constants/config.py
CHANGED
|
@@ -11,6 +11,8 @@ ENV_PYARROW_MAX_CHUNKSIZE = "ARIZE_MAX_CHUNKSIZE"
|
|
|
11
11
|
ENV_REQUEST_VERIFY = "ARIZE_REQUEST_VERIFY"
|
|
12
12
|
ENV_INSECURE = "ARIZE_INSECURE"
|
|
13
13
|
ENV_MAX_HTTP_PAYLOAD_SIZE_MB = "ARIZE_MAX_HTTP_PAYLOAD_SIZE_MB"
|
|
14
|
+
ENV_ARIZE_DIRECTORY = "ARIZE_DIRECTORY"
|
|
15
|
+
ENV_ENABLE_CACHING = "ARIZE_ENABLE_CACHING"
|
|
14
16
|
|
|
15
17
|
# Server configuration default values
|
|
16
18
|
DEFAULT_API_HOST = "api.arize.com" # NOTE: Must not prefix with https://
|
|
@@ -22,6 +24,8 @@ DEFAULT_PYARROW_MAX_CHUNKSIZE = 10_000
|
|
|
22
24
|
DEFAULT_REQUEST_VERIFY = True
|
|
23
25
|
DEFAULT_INSECURE = False
|
|
24
26
|
DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB = 100
|
|
27
|
+
DEFAULT_ARIZE_DIRECTORY = "~/.arize"
|
|
28
|
+
DEFAULT_ENABLE_CACHING = True
|
|
25
29
|
|
|
26
30
|
# ML Streaming configuration
|
|
27
31
|
ENV_STREAM_MAX_WORKERS = "ARIZE_STREAM_MAX_WORKERS"
|
arize/datasets/client.py
CHANGED
|
@@ -13,6 +13,7 @@ from arize._generated.api_client import models
|
|
|
13
13
|
from arize.config import SDKConfiguration
|
|
14
14
|
from arize.datasets.validation import validate_dataset_df
|
|
15
15
|
from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
|
|
16
|
+
from arize.utils.cache import cache_resource, load_cached_resource
|
|
16
17
|
from arize.utils.openinference_conversion import (
|
|
17
18
|
convert_boolean_columns_to_str,
|
|
18
19
|
convert_datetime_columns_to_int,
|
|
@@ -22,9 +23,6 @@ from arize.utils.size import get_payload_size_mb
|
|
|
22
23
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
25
|
-
# TODO(Kiko): Decide based on size of payload instead
|
|
26
|
-
REST_LIMIT_DATASET_EXAMPLES = 0
|
|
27
|
-
|
|
28
26
|
|
|
29
27
|
class DatasetsClient:
|
|
30
28
|
def __init__(self, sdk_config: SDKConfiguration):
|
|
@@ -42,57 +40,8 @@ class DatasetsClient:
|
|
|
42
40
|
self.delete = self._api.datasets_delete
|
|
43
41
|
|
|
44
42
|
# Custom methods
|
|
45
|
-
self.list_examples = self._list_examples
|
|
46
43
|
self.create = self._create_dataset
|
|
47
|
-
|
|
48
|
-
def _list_examples(
|
|
49
|
-
self,
|
|
50
|
-
dataset_id: str,
|
|
51
|
-
dataset_version_id: str = "",
|
|
52
|
-
limit: int = 100,
|
|
53
|
-
all: bool = False,
|
|
54
|
-
):
|
|
55
|
-
if not all:
|
|
56
|
-
return self._api.datasets_list_examples(
|
|
57
|
-
dataset_id=dataset_id,
|
|
58
|
-
dataset_version_id=dataset_version_id,
|
|
59
|
-
limit=limit,
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
# TODO(Kiko): Space ID should not be needed,
|
|
63
|
-
# should work on server tech debt to remove this
|
|
64
|
-
dataset = self.get(dataset_id=dataset_id)
|
|
65
|
-
space_id = dataset.space_id
|
|
66
|
-
|
|
67
|
-
with ArizeFlightClient(
|
|
68
|
-
api_key=self._sdk_config.api_key,
|
|
69
|
-
host=self._sdk_config.flight_server_host,
|
|
70
|
-
port=self._sdk_config.flight_server_port,
|
|
71
|
-
scheme=self._sdk_config.flight_scheme,
|
|
72
|
-
request_verify=self._sdk_config.request_verify,
|
|
73
|
-
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
74
|
-
) as flight_client:
|
|
75
|
-
try:
|
|
76
|
-
response = flight_client.get_dataset_examples(
|
|
77
|
-
space_id=space_id,
|
|
78
|
-
dataset_id=dataset_id,
|
|
79
|
-
dataset_version_id=dataset_version_id,
|
|
80
|
-
)
|
|
81
|
-
except Exception as e:
|
|
82
|
-
msg = f"Error during request: {str(e)}"
|
|
83
|
-
logger.error(msg)
|
|
84
|
-
raise RuntimeError(msg) from e
|
|
85
|
-
if response is None:
|
|
86
|
-
# This should not happen with proper Flight client implementation,
|
|
87
|
-
# but we handle it defensively
|
|
88
|
-
msg = "No response received from flight server during request"
|
|
89
|
-
logger.error(msg)
|
|
90
|
-
raise RuntimeError(msg)
|
|
91
|
-
# The response from flightserver is the dataset ID. To return the dataset
|
|
92
|
-
# object we make a GET query
|
|
93
|
-
return models.DatasetsListExamples200Response(
|
|
94
|
-
examples=response.to_dict(orient="records")
|
|
95
|
-
)
|
|
44
|
+
self.list_examples = self._list_examples
|
|
96
45
|
|
|
97
46
|
def _create_dataset(
|
|
98
47
|
self,
|
|
@@ -203,23 +152,95 @@ class DatasetsClient:
|
|
|
203
152
|
dataset = self.get(dataset_id=response)
|
|
204
153
|
return dataset
|
|
205
154
|
|
|
155
|
+
def _list_examples(
|
|
156
|
+
self,
|
|
157
|
+
dataset_id: str,
|
|
158
|
+
dataset_version_id: str = "",
|
|
159
|
+
limit: int = 100,
|
|
160
|
+
all: bool = False,
|
|
161
|
+
):
|
|
162
|
+
if not all:
|
|
163
|
+
return self._api.datasets_list_examples(
|
|
164
|
+
dataset_id=dataset_id,
|
|
165
|
+
dataset_version_id=dataset_version_id,
|
|
166
|
+
limit=limit,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
dataset = self.get(dataset_id=dataset_id)
|
|
170
|
+
dataset_updated_at = getattr(dataset, "updated_at", None)
|
|
171
|
+
# TODO(Kiko): Space ID should not be needed,
|
|
172
|
+
# should work on server tech debt to remove this
|
|
173
|
+
space_id = dataset.space_id
|
|
174
|
+
|
|
175
|
+
dataset_df = None
|
|
176
|
+
# try to load dataset from cache
|
|
177
|
+
if self._sdk_config.enable_caching:
|
|
178
|
+
dataset_df = load_cached_resource(
|
|
179
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
180
|
+
resource="dataset",
|
|
181
|
+
resource_id=dataset_id,
|
|
182
|
+
resource_updated_at=dataset_updated_at,
|
|
183
|
+
)
|
|
184
|
+
if dataset_df is not None:
|
|
185
|
+
return models.DatasetsListExamples200Response(
|
|
186
|
+
examples=dataset_df.to_dict(orient="records")
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
with ArizeFlightClient(
|
|
190
|
+
api_key=self._sdk_config.api_key,
|
|
191
|
+
host=self._sdk_config.flight_server_host,
|
|
192
|
+
port=self._sdk_config.flight_server_port,
|
|
193
|
+
scheme=self._sdk_config.flight_scheme,
|
|
194
|
+
request_verify=self._sdk_config.request_verify,
|
|
195
|
+
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
196
|
+
) as flight_client:
|
|
197
|
+
try:
|
|
198
|
+
dataset_df = flight_client.get_dataset_examples(
|
|
199
|
+
space_id=space_id,
|
|
200
|
+
dataset_id=dataset_id,
|
|
201
|
+
dataset_version_id=dataset_version_id,
|
|
202
|
+
)
|
|
203
|
+
except Exception as e:
|
|
204
|
+
msg = f"Error during request: {str(e)}"
|
|
205
|
+
logger.error(msg)
|
|
206
|
+
raise RuntimeError(msg) from e
|
|
207
|
+
if dataset_df is None:
|
|
208
|
+
# This should not happen with proper Flight client implementation,
|
|
209
|
+
# but we handle it defensively
|
|
210
|
+
msg = "No response received from flight server during request"
|
|
211
|
+
logger.error(msg)
|
|
212
|
+
raise RuntimeError(msg)
|
|
213
|
+
|
|
214
|
+
# cache dataset for future use
|
|
215
|
+
cache_resource(
|
|
216
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
217
|
+
resource="dataset",
|
|
218
|
+
resource_id=dataset_id,
|
|
219
|
+
resource_updated_at=dataset_updated_at,
|
|
220
|
+
resource_data=dataset_df,
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
return models.DatasetsListExamples200Response(
|
|
224
|
+
examples=dataset_df.to_dict(orient="records")
|
|
225
|
+
)
|
|
226
|
+
|
|
206
227
|
|
|
207
228
|
def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
|
|
208
229
|
current_time = int(time.time() * 1000)
|
|
209
230
|
if "created_at" in df.columns:
|
|
210
|
-
if df["created_at"].isnull().values.any():
|
|
231
|
+
if df["created_at"].isnull().values.any(): # type: ignore
|
|
211
232
|
df["created_at"].fillna(current_time, inplace=True)
|
|
212
233
|
else:
|
|
213
234
|
df["created_at"] = current_time
|
|
214
235
|
|
|
215
236
|
if "updated_at" in df.columns:
|
|
216
|
-
if df["updated_at"].isnull().values.any():
|
|
237
|
+
if df["updated_at"].isnull().values.any(): # type: ignore
|
|
217
238
|
df["updated_at"].fillna(current_time, inplace=True)
|
|
218
239
|
else:
|
|
219
240
|
df["updated_at"] = current_time
|
|
220
241
|
|
|
221
242
|
if "id" in df.columns:
|
|
222
|
-
if df["id"].isnull().values.any():
|
|
243
|
+
if df["id"].isnull().values.any(): # type: ignore
|
|
223
244
|
df["id"] = df["id"].apply(
|
|
224
245
|
lambda x: str(uuid.uuid4()) if pd.isnull(x) else x
|
|
225
246
|
)
|
arize/experiments/client.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import hashlib
|
|
3
4
|
import logging
|
|
4
5
|
from typing import TYPE_CHECKING, Any, Dict, List, Tuple
|
|
5
6
|
|
|
@@ -19,6 +20,7 @@ from opentelemetry.trace import Tracer
|
|
|
19
20
|
|
|
20
21
|
from arize._flight.client import ArizeFlightClient
|
|
21
22
|
from arize._flight.types import FlightRequestType
|
|
23
|
+
from arize._generated.api_client import models
|
|
22
24
|
from arize.config import SDKConfiguration
|
|
23
25
|
from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
|
|
24
26
|
from arize.experiments.evaluators.base import Evaluators
|
|
@@ -31,6 +33,7 @@ from arize.experiments.types import (
|
|
|
31
33
|
ExperimentTask,
|
|
32
34
|
ExperimentTaskResultFieldNames,
|
|
33
35
|
)
|
|
36
|
+
from arize.utils.cache import cache_resource, load_cached_resource
|
|
34
37
|
from arize.utils.openinference_conversion import (
|
|
35
38
|
convert_boolean_columns_to_str,
|
|
36
39
|
convert_default_columns_to_json_str,
|
|
@@ -57,27 +60,28 @@ class ExperimentsClient:
|
|
|
57
60
|
self._datasets_api = gen.DatasetsApi(
|
|
58
61
|
self._sdk_config.get_generated_client()
|
|
59
62
|
)
|
|
63
|
+
|
|
60
64
|
self.list = self._api.experiments_list
|
|
61
65
|
self.get = self._api.experiments_get
|
|
62
66
|
self.delete = self._api.experiments_delete
|
|
63
|
-
self.list_runs = self._api.experiments_runs_list # REST ?
|
|
64
67
|
|
|
65
68
|
# Custom methods
|
|
66
|
-
self.create = self._create_experiment
|
|
67
69
|
self.run = self._run_experiment
|
|
70
|
+
self.create = self._create_experiment
|
|
71
|
+
self.list_runs = self._api.experiments_runs_list
|
|
68
72
|
|
|
69
73
|
def _run_experiment(
|
|
70
74
|
self,
|
|
71
75
|
name: str,
|
|
72
76
|
dataset_id: str,
|
|
73
77
|
task: ExperimentTask,
|
|
74
|
-
dataset_df: pd.DataFrame | None = None,
|
|
75
78
|
evaluators: Evaluators | None = None,
|
|
76
79
|
dry_run: bool = False,
|
|
80
|
+
dry_run_count: int = 10,
|
|
77
81
|
concurrency: int = 3,
|
|
78
82
|
set_global_tracer_provider: bool = False,
|
|
79
83
|
exit_on_error: bool = False,
|
|
80
|
-
) -> Tuple[
|
|
84
|
+
) -> Tuple[Experiment | None, pd.DataFrame] | None:
|
|
81
85
|
"""
|
|
82
86
|
Run an experiment on a dataset and upload the results.
|
|
83
87
|
|
|
@@ -87,9 +91,6 @@ class ExperimentsClient:
|
|
|
87
91
|
Args:
|
|
88
92
|
experiment_name (str): The name of the experiment.
|
|
89
93
|
task (ExperimentTask): The task to be performed in the experiment.
|
|
90
|
-
dataset_df (Optional[pd.DataFrame], optional): The dataset as a pandas DataFrame.
|
|
91
|
-
If not provided, the dataset will be downloaded using dataset_id or dataset_name.
|
|
92
|
-
Defaults to None.
|
|
93
94
|
dataset_id (Optional[str], optional): The ID of the dataset to use.
|
|
94
95
|
Required if dataset_df and dataset_name are not provided. Defaults to None.
|
|
95
96
|
dataset_name (Optional[str], optional): The name of the dataset to use.
|
|
@@ -116,6 +117,7 @@ class ExperimentsClient:
|
|
|
116
117
|
# should work on server tech debt to remove this
|
|
117
118
|
dataset = self._datasets_api.datasets_get(dataset_id=dataset_id)
|
|
118
119
|
space_id = dataset.space_id
|
|
120
|
+
dataset_updated_at = getattr(dataset, "updated_at", None)
|
|
119
121
|
|
|
120
122
|
with ArizeFlightClient(
|
|
121
123
|
api_key=self._sdk_config.api_key,
|
|
@@ -152,10 +154,20 @@ class ExperimentsClient:
|
|
|
152
154
|
raise RuntimeError(msg)
|
|
153
155
|
experiment_id, trace_model_name = response
|
|
154
156
|
|
|
155
|
-
|
|
157
|
+
dataset_df = None
|
|
158
|
+
# try to load dataset from cache
|
|
159
|
+
if self._sdk_config.enable_caching:
|
|
160
|
+
dataset_df = load_cached_resource(
|
|
161
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
162
|
+
resource="dataset",
|
|
163
|
+
resource_id=dataset_id,
|
|
164
|
+
resource_updated_at=dataset_updated_at,
|
|
165
|
+
)
|
|
166
|
+
|
|
156
167
|
if dataset_df is None:
|
|
168
|
+
# download dataset
|
|
157
169
|
try:
|
|
158
|
-
|
|
170
|
+
dataset_df = flight_client.get_dataset_examples(
|
|
159
171
|
space_id=space_id,
|
|
160
172
|
dataset_id=dataset_id,
|
|
161
173
|
)
|
|
@@ -163,7 +175,7 @@ class ExperimentsClient:
|
|
|
163
175
|
msg = f"Error during request: {str(e)}"
|
|
164
176
|
logger.error(msg)
|
|
165
177
|
raise RuntimeError(msg) from e
|
|
166
|
-
if
|
|
178
|
+
if dataset_df is None:
|
|
167
179
|
# This should not happen with proper Flight client implementation,
|
|
168
180
|
# but we handle it defensively
|
|
169
181
|
msg = (
|
|
@@ -172,13 +184,21 @@ class ExperimentsClient:
|
|
|
172
184
|
logger.error(msg)
|
|
173
185
|
raise RuntimeError(msg)
|
|
174
186
|
|
|
175
|
-
if dataset_df
|
|
187
|
+
if dataset_df.empty:
|
|
176
188
|
raise ValueError(f"Dataset {dataset_id} is empty")
|
|
177
189
|
|
|
178
|
-
|
|
190
|
+
# cache dataset for future use
|
|
191
|
+
cache_resource(
|
|
192
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
193
|
+
resource="dataset",
|
|
194
|
+
resource_id=dataset_id,
|
|
195
|
+
resource_updated_at=dataset_updated_at,
|
|
196
|
+
resource_data=dataset_df,
|
|
197
|
+
)
|
|
198
|
+
|
|
179
199
|
if dry_run:
|
|
180
|
-
# only dry_run experiment on a subset (first
|
|
181
|
-
|
|
200
|
+
# only dry_run experiment on a subset (first N rows) of the dataset
|
|
201
|
+
dataset_df = dataset_df.head(dry_run_count)
|
|
182
202
|
|
|
183
203
|
# trace model and resource for the experiment
|
|
184
204
|
tracer, resource = _get_tracer_resource(
|
|
@@ -193,7 +213,7 @@ class ExperimentsClient:
|
|
|
193
213
|
output_df = run_experiment(
|
|
194
214
|
experiment_name=name,
|
|
195
215
|
experiment_id=experiment_id,
|
|
196
|
-
dataset=
|
|
216
|
+
dataset=dataset_df,
|
|
197
217
|
task=task,
|
|
198
218
|
tracer=tracer,
|
|
199
219
|
resource=resource,
|
|
@@ -204,7 +224,7 @@ class ExperimentsClient:
|
|
|
204
224
|
output_df = convert_default_columns_to_json_str(output_df)
|
|
205
225
|
output_df = convert_boolean_columns_to_str(output_df)
|
|
206
226
|
if dry_run:
|
|
207
|
-
return
|
|
227
|
+
return None, output_df
|
|
208
228
|
|
|
209
229
|
# Convert to Arrow table
|
|
210
230
|
try:
|
|
@@ -241,7 +261,10 @@ class ExperimentsClient:
|
|
|
241
261
|
logger.error(msg)
|
|
242
262
|
raise RuntimeError(msg)
|
|
243
263
|
|
|
244
|
-
|
|
264
|
+
experiment = self.get(
|
|
265
|
+
experiment_id=str(post_resp.experiment_id) # type: ignore
|
|
266
|
+
)
|
|
267
|
+
return experiment, output_df
|
|
245
268
|
|
|
246
269
|
def _create_experiment(
|
|
247
270
|
self,
|
|
@@ -352,6 +375,78 @@ class ExperimentsClient:
|
|
|
352
375
|
experiment_df=experiment_df,
|
|
353
376
|
)
|
|
354
377
|
|
|
378
|
+
def _list_runs(
|
|
379
|
+
self,
|
|
380
|
+
experiment_id: str,
|
|
381
|
+
limit: int = 100,
|
|
382
|
+
all: bool = False,
|
|
383
|
+
):
|
|
384
|
+
if not all:
|
|
385
|
+
return self._api.experiments_runs_list(
|
|
386
|
+
experiment_id=experiment_id,
|
|
387
|
+
limit=limit,
|
|
388
|
+
)
|
|
389
|
+
|
|
390
|
+
experiment = self.get(experiment_id=experiment_id)
|
|
391
|
+
experiment_updated_at = getattr(experiment, "updated_at", None)
|
|
392
|
+
# TODO(Kiko): Space ID should not be needed,
|
|
393
|
+
# should work on server tech debt to remove this
|
|
394
|
+
dataset = self._datasets_api.datasets_get(
|
|
395
|
+
dataset_id=experiment.dataset_id
|
|
396
|
+
)
|
|
397
|
+
space_id = dataset.space_id
|
|
398
|
+
|
|
399
|
+
experiment_df = None
|
|
400
|
+
# try to load dataset from cache
|
|
401
|
+
if self._sdk_config.enable_caching:
|
|
402
|
+
experiment_df = load_cached_resource(
|
|
403
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
404
|
+
resource="experiment",
|
|
405
|
+
resource_id=experiment_id,
|
|
406
|
+
resource_updated_at=experiment_updated_at,
|
|
407
|
+
)
|
|
408
|
+
if experiment_df is not None:
|
|
409
|
+
return models.ExperimentsRunsList200Response(
|
|
410
|
+
experimentRuns=experiment_df.to_dict(orient="records")
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
with ArizeFlightClient(
|
|
414
|
+
api_key=self._sdk_config.api_key,
|
|
415
|
+
host=self._sdk_config.flight_server_host,
|
|
416
|
+
port=self._sdk_config.flight_server_port,
|
|
417
|
+
scheme=self._sdk_config.flight_scheme,
|
|
418
|
+
request_verify=self._sdk_config.request_verify,
|
|
419
|
+
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
420
|
+
) as flight_client:
|
|
421
|
+
try:
|
|
422
|
+
experiment_df = flight_client.get_experiment_runs(
|
|
423
|
+
space_id=space_id,
|
|
424
|
+
experiment_id=experiment_id,
|
|
425
|
+
)
|
|
426
|
+
except Exception as e:
|
|
427
|
+
msg = f"Error during request: {str(e)}"
|
|
428
|
+
logger.error(msg)
|
|
429
|
+
raise RuntimeError(msg) from e
|
|
430
|
+
if experiment_df is None:
|
|
431
|
+
# This should not happen with proper Flight client implementation,
|
|
432
|
+
# but we handle it defensively
|
|
433
|
+
msg = "No response received from flight server during request"
|
|
434
|
+
logger.error(msg)
|
|
435
|
+
raise RuntimeError(msg)
|
|
436
|
+
|
|
437
|
+
# cache dataset for future use
|
|
438
|
+
cache_resource(
|
|
439
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
440
|
+
resource="dataset",
|
|
441
|
+
resource_id=experiment_id,
|
|
442
|
+
resource_updated_at=experiment_updated_at,
|
|
443
|
+
resource_data=experiment_df,
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
return models.ExperimentsRunsList200Response(
|
|
447
|
+
experimentRuns=experiment_df.to_dict(orient="records")
|
|
448
|
+
)
|
|
449
|
+
|
|
355
450
|
def _create_experiment_via_flight(
|
|
356
451
|
self,
|
|
357
452
|
name: str,
|
|
@@ -463,3 +558,9 @@ def _get_tracer_resource(
|
|
|
463
558
|
trace.set_tracer_provider(tracer_provider)
|
|
464
559
|
|
|
465
560
|
return tracer_provider.get_tracer(__name__), resource
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _dataset_cache_key(dataset_id: str, dataset_updated_at: str | None) -> str:
|
|
564
|
+
# include updated_at if present to produce a new key when dataset changes
|
|
565
|
+
key_src = f"{dataset_id}:{dataset_updated_at or ''}"
|
|
566
|
+
return hashlib.sha256(key_src.encode("utf-8")).hexdigest()
|
arize/utils/cache.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def load_cached_resource(
|
|
12
|
+
cache_dir: str,
|
|
13
|
+
resource: str,
|
|
14
|
+
resource_id: str,
|
|
15
|
+
resource_updated_at: str | None,
|
|
16
|
+
format: str = "parquet",
|
|
17
|
+
) -> pd.DataFrame | None:
|
|
18
|
+
key = _get_cache_key(resource, resource_id, resource_updated_at)
|
|
19
|
+
filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
|
|
20
|
+
if not filepath.exists():
|
|
21
|
+
return None
|
|
22
|
+
try:
|
|
23
|
+
return pd.read_parquet(filepath)
|
|
24
|
+
except Exception as e:
|
|
25
|
+
logger.warning(f"Failed to load cached resource from {filepath}: {e}")
|
|
26
|
+
return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def cache_resource(
|
|
30
|
+
cache_dir: str,
|
|
31
|
+
resource: str,
|
|
32
|
+
resource_id: str,
|
|
33
|
+
resource_updated_at: str | None,
|
|
34
|
+
resource_data: pd.DataFrame,
|
|
35
|
+
format: str = "parquet",
|
|
36
|
+
) -> None:
|
|
37
|
+
key = _get_cache_key(resource, resource_id, resource_updated_at)
|
|
38
|
+
filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
|
|
39
|
+
filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
40
|
+
resource_data.to_parquet(filepath, index=False)
|
|
41
|
+
logger.debug(f"Cached resource to {filepath}")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _get_cache_key(
|
|
45
|
+
resource: str,
|
|
46
|
+
resource_id: str,
|
|
47
|
+
resource_updated_at: str | None,
|
|
48
|
+
) -> str:
|
|
49
|
+
# include updated_at if present to produce a new key when dataset changes
|
|
50
|
+
key = f"{resource}_{resource_id}"
|
|
51
|
+
if resource_updated_at:
|
|
52
|
+
key += f"_{resource_updated_at}"
|
|
53
|
+
return key
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _get_abs_file_path(
|
|
57
|
+
directory: str,
|
|
58
|
+
filename: str,
|
|
59
|
+
subdirectory: str | None = None,
|
|
60
|
+
) -> Path:
|
|
61
|
+
"""
|
|
62
|
+
Return an absolute path to a file located under `directory[/subdirectory]/filename`.
|
|
63
|
+
Expands '~' and resolves relative components.
|
|
64
|
+
"""
|
|
65
|
+
base = Path(directory).expanduser()
|
|
66
|
+
if subdirectory:
|
|
67
|
+
base = base / subdirectory
|
|
68
|
+
return (base / filename).resolve()
|
arize/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "8.0.
|
|
1
|
+
__version__ = "8.0.0a17"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arize
|
|
3
|
-
Version: 8.0.
|
|
3
|
+
Version: 8.0.0a17
|
|
4
4
|
Summary: A helper library to interact with Arize AI APIs
|
|
5
5
|
Project-URL: Homepage, https://arize.com
|
|
6
6
|
Project-URL: Documentation, https://docs.arize.com/arize
|
|
@@ -99,11 +99,24 @@ Description-Content-Type: text/markdown
|
|
|
99
99
|
- [Operations on Datasets](#operations-on-datasets)
|
|
100
100
|
- [List Datasets](#list-datasets)
|
|
101
101
|
- [Create a Dataset](#create-a-dataset)
|
|
102
|
-
- [Get Dataset
|
|
102
|
+
- [Get Dataset](#get-dataset)
|
|
103
103
|
- [Delete a Dataset](#delete-a-dataset)
|
|
104
|
-
- [
|
|
105
|
-
- [
|
|
106
|
-
|
|
104
|
+
- [List Dataset Examples](#list-dataset-examples)
|
|
105
|
+
- [Operations on Experiments](#operations-on-experiments)
|
|
106
|
+
- [List Experiments](#list-experiments)
|
|
107
|
+
- [Run an Experiment](#run-an-experiment)
|
|
108
|
+
- [Create an Experiment](#create-an-experiment)
|
|
109
|
+
- [Get an Experiment](#get-an-experiment)
|
|
110
|
+
- [Delete an Experiment](#delete-an-experiment)
|
|
111
|
+
- [List Experiment runs](#list-experiment-runs)
|
|
112
|
+
- [SDK Configuration](#sdk-configuration)
|
|
113
|
+
- [Logging](#logging)
|
|
114
|
+
- [In Code](#in-code)
|
|
115
|
+
- [Via Environment Variables](#via-environment-variables)
|
|
116
|
+
- [Caching](#caching)
|
|
117
|
+
- [In Code](#in-code-1)
|
|
118
|
+
- [Via Environment Variables](#via-environment-variables-1)
|
|
119
|
+
- [Clean the cache](#clean-the-cache)
|
|
107
120
|
- [Community](#community)
|
|
108
121
|
|
|
109
122
|
# Overview
|
|
@@ -398,9 +411,9 @@ dataset_list = resp.datasets
|
|
|
398
411
|
# Get the response as a dictionary
|
|
399
412
|
resp_dict = resp.to_dict()
|
|
400
413
|
# Get the response in JSON format
|
|
401
|
-
|
|
414
|
+
resp_json = resp.to_json()
|
|
402
415
|
# Get the response as a pandas dataframe
|
|
403
|
-
|
|
416
|
+
resp_df = resp.to_df()
|
|
404
417
|
```
|
|
405
418
|
|
|
406
419
|
### Create a Dataset
|
|
@@ -430,9 +443,10 @@ If the number of examples (rows in dataframe, items in list) is too large, the c
|
|
|
430
443
|
|
|
431
444
|
```python
|
|
432
445
|
created_dataset = client.datasets.create(
|
|
433
|
-
|
|
446
|
+
space_id="<target-space-id>",
|
|
434
447
|
name="<your-dataset-name>", # Name must be unique within a space
|
|
435
448
|
examples=..., # List of dictionaries or pandas dataframe
|
|
449
|
+
# force_http=... # Optionally pass force_http to create datasets via HTTP instead of gRPC, defaults to False
|
|
436
450
|
)
|
|
437
451
|
```
|
|
438
452
|
|
|
@@ -445,8 +459,7 @@ dataset_dict = create_dataset.to_dict()
|
|
|
445
459
|
dataset_dict = create_dataset.to_json()
|
|
446
460
|
```
|
|
447
461
|
|
|
448
|
-
|
|
449
|
-
### Get Dataset by ID
|
|
462
|
+
### Get Dataset
|
|
450
463
|
|
|
451
464
|
To get a dataset by its ID use `client.datasets.get()`, you can optionally also pass the version ID of a particular version of interest of the dataset. The returned type is `Dataset`.
|
|
452
465
|
|
|
@@ -467,9 +480,167 @@ client.datasets.delete(
|
|
|
467
480
|
)
|
|
468
481
|
```
|
|
469
482
|
|
|
470
|
-
|
|
483
|
+
### List Dataset Examples
|
|
484
|
+
|
|
485
|
+
You can list the examples of a given dataset using `client.datasets.list_examples()` and passing the dataset ID and, optionally, the dataset version ID. You can specify the number of examples desired using the `limit` parameter. If you want a large number of examples, consider using the `all=True` parameter, which will make it so the SDK exports the data using Arrow Flight via gRPC, for increased performance.
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
resp = client.datasets.list_examples(
|
|
489
|
+
dataset_id="<your-dataset-id>",
|
|
490
|
+
dataset_version_id="<your-dataset-version-id>", # Optional, defaults to latest version
|
|
491
|
+
limit=... # number of desired examples. Defaults to 100
|
|
492
|
+
all=... # Whether or not to export all of the examples. Defaults to False
|
|
493
|
+
)
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
The response is an object of type `DatasetsExamplesList200Response`, and you can access the list of examples via its `examples` attribute. In addition, you can transform the response object to a dictionary, to JSON format, or a pandas dataframe.
|
|
497
|
+
|
|
498
|
+
```python
|
|
499
|
+
# Get the list of examples from the response
|
|
500
|
+
examples_list = resp.examples
|
|
501
|
+
# Get the response as a dictionary
|
|
502
|
+
resp_dict = resp.to_dict()
|
|
503
|
+
# Get the response in JSON format
|
|
504
|
+
resp_json = resp.to_json()
|
|
505
|
+
# Get the response as a pandas dataframe
|
|
506
|
+
resp_df = resp.to_df()
|
|
507
|
+
```
|
|
508
|
+
|
|
509
|
+
## Operations on Experiments
|
|
510
|
+
|
|
511
|
+
### List Experiments
|
|
512
|
+
|
|
513
|
+
You can list all experiments that the user has access to using `client.experiments.list()`. You can use the `limit` parameter to specify the maximum number of experiments desired in the response and you can specify the `dataset_id` to target the list operation to a particular dataset.
|
|
514
|
+
|
|
515
|
+
```python
|
|
516
|
+
resp = client.experiments.list(
|
|
517
|
+
limit=... # Optional
|
|
518
|
+
dataset_id=... # Optional
|
|
519
|
+
)
|
|
520
|
+
```
|
|
521
|
+
|
|
522
|
+
The response is an object of type `ExperimentsList200Response`, and you can access the list of experiments via its `experiments` attribute. In addition, you can transform the response object to a dictionary, to JSON format, or a pandas dataframe.
|
|
523
|
+
|
|
524
|
+
```python
|
|
525
|
+
# Get the list of experiments from the response
|
|
526
|
+
experiment_list = resp.experiments
|
|
527
|
+
# Get the response as a dictionary
|
|
528
|
+
resp_dict = resp.to_dict()
|
|
529
|
+
# Get the response in JSON format
|
|
530
|
+
resp_json = resp.to_json()
|
|
531
|
+
# Get the response as a pandas dataframe
|
|
532
|
+
resp_df = resp.to_df()
|
|
533
|
+
```
|
|
534
|
+
|
|
535
|
+
### Run an Experiment
|
|
536
|
+
|
|
537
|
+
You can run an experiment on a dataset using `client.experiments.run()` by defining a task, evaluators (optional), and passing the dataset id of the dataset you want to use, together with a name for the experiment. The function will download the entire dataset from Arize (unless cached, see caching section under "SDK Configuration"), execute the task to obtain an output, and perform evaluations (if evaluators were passed). The experiments will also be traced, and these traces will be visible in Arize. The experiment will be created and the data logged into Arize automatically. You can avoid logging to Arize by making `dry_run=True`. The function will return the `Experiment` object (or `None` if `dry_run=True`) together with the dataframe with the experiment data.
|
|
538
|
+
|
|
539
|
+
```python
|
|
540
|
+
experiment, experiment_df = client.experiments.run(
|
|
541
|
+
name="<name-your-experiment>",
|
|
542
|
+
dataset_id="<id-of-dataset-to-use>",
|
|
543
|
+
task=... # The task to be performed in the experiment.
|
|
544
|
+
evaluators=... # Optional: The evaluators to use in the experiment.
|
|
545
|
+
dry_run=..., # If True, the experiment result will not be uploaded to Arize. Defaults to False
|
|
546
|
+
dry_run_count=..., # Number of examples of the dataset to use in the dry run. Defaults to 10
|
|
547
|
+
concurrency=..., # The number of concurrent tasks to run. Defaults to 3.
|
|
548
|
+
set_global_tracer_provider=..., # If True, sets the global tracer provider for the experiment. Defaults to False
|
|
549
|
+
exit_on_error=..., # If True, the experiment will stop running on first occurrence of an error. Defaults to False
|
|
550
|
+
)
|
|
551
|
+
```
|
|
552
|
+
|
|
553
|
+
The `Experiment` object also provides convenience methods similar to `List***` objects:
|
|
554
|
+
|
|
555
|
+
```python
|
|
556
|
+
# Get the response as a dictionary
|
|
557
|
+
experiment_dict = create_experiment.to_dict()
|
|
558
|
+
# Get the response in JSON format
|
|
559
|
+
experiment_json = create_experiment.to_json()
|
|
560
|
+
```
|
|
561
|
+
|
|
562
|
+
### Create an Experiment
|
|
563
|
+
|
|
564
|
+
It is possible that you have run the experiment yourself without the above function, and hence you already have experiment data that you want to send to Arize. In this case, use the `client.experiments.create()` method by passing the runs data (we currently don't support creating an empty experiment). For instance, these are 2 rows of runs, as a list of dictionaries. You can also pass a pandas dataframe for the runs data.
|
|
565
|
+
|
|
566
|
+
> NOTE: If you don't have experiment data and want to run an experiment, see the `client.experiments.run()` section above.
|
|
567
|
+
|
|
568
|
+
```python
|
|
569
|
+
# TODO
|
|
570
|
+
runs = [
|
|
571
|
+
]
|
|
572
|
+
```
|
|
573
|
+
|
|
574
|
+
In addition, you must specify which columns are the `example_id` and the `result`, you can do so by using the `ExperimentTaskResultFieldNames`. Moreover, if you choose to pass evaluation data, you can indicate the evaluation columns using `EvaluationResultFieldNames`:
|
|
575
|
+
|
|
576
|
+
```python
|
|
577
|
+
# TODO
|
|
578
|
+
```
|
|
579
|
+
|
|
580
|
+
If the number of runs (rows in dataframe, items in list) is too large, the client SDK will try to send the data via Arrow Flight via gRPC for better performance. If you want to force the data transfer to HTTP you can use the `force_http` flag. The response is an `Experiment` object.
|
|
581
|
+
|
|
582
|
+
```python
|
|
583
|
+
created_experiment = client.experiments.create(
|
|
584
|
+
name="<your-experiment-name>", # Name must be unique within a dataset
|
|
585
|
+
dataset_id="<desired-dataset-id>",
|
|
586
|
+
experiment_runs=..., # List of dictionaries or pandas dataframe
|
|
587
|
+
task_fields=ExperimentTaskResultFieldNames(...),
|
|
588
|
+
evaluator_columns=... # Optional
|
|
589
|
+
# force_http=... # Optionally pass force_http to create experiments via HTTP instead of gRPC, defaults to False
|
|
590
|
+
)
|
|
591
|
+
```
|
|
592
|
+
|
|
593
|
+
### Get an Experiment
|
|
594
|
+
|
|
595
|
+
To get an experiment by its ID use `client.experiments.get()`. The returned type is `Experiment`.
|
|
596
|
+
|
|
597
|
+
```python
|
|
598
|
+
experiment = client.experiments.get(
|
|
599
|
+
experiment_id=... # The unique identifier of the experiment
|
|
600
|
+
|
|
601
|
+
)
|
|
602
|
+
```
|
|
603
|
+
|
|
604
|
+
### Delete an Experiment
|
|
605
|
+
|
|
606
|
+
To delete an experiment by its ID use `client.experiments.delete()`. The call returns `None` if successful deletion took place, error otherwise.
|
|
607
|
+
|
|
608
|
+
```python
|
|
609
|
+
client.experiments.delete(
|
|
610
|
+
experiment_id=... # The unique identifier of the experiment
|
|
611
|
+
)
|
|
612
|
+
```
|
|
613
|
+
|
|
614
|
+
### List Experiment runs
|
|
471
615
|
|
|
472
|
-
|
|
616
|
+
You can list the runs of a given experiment using `client.experiments.list_runs()` and passing the experiment ID. You can specify the number of runs desired using the `limit` parameter. If you want a large number of runs, consider using the `all=True` parameter, which will make it so the SDK exports the data using Arrow Flight via gRPC, for increased performance.
|
|
617
|
+
|
|
618
|
+
```python
|
|
619
|
+
resp = client.experiments.list_runs(
|
|
620
|
+
experiment_id="<your-experiment-id>",
|
|
621
|
+
limit=... # number of desired runs. Defaults to 100
|
|
622
|
+
all=... # Whether or not to export all of the runs. Defaults to False
|
|
623
|
+
)
|
|
624
|
+
```
|
|
625
|
+
|
|
626
|
+
The response is an object of type `ExperimentsRunsList200Response`, and you can access the list of runs via its `experiment_runs` attribute. In addition, you can transform the response object to a dictionary, to JSON format, or a pandas dataframe.
|
|
627
|
+
|
|
628
|
+
```python
|
|
629
|
+
# Get the list of runs from the response
|
|
630
|
+
run_list = resp.experiment_runs
|
|
631
|
+
# Get the response as a dictionary
|
|
632
|
+
resp_dict = resp.to_dict()
|
|
633
|
+
# Get the response in JSON format
|
|
634
|
+
resp_json = resp.to_json()
|
|
635
|
+
# Get the response as a pandas dataframe
|
|
636
|
+
resp_df = resp.to_df()
|
|
637
|
+
```
|
|
638
|
+
|
|
639
|
+
# SDK Configuration
|
|
640
|
+
|
|
641
|
+
## Logging
|
|
642
|
+
|
|
643
|
+
### In Code
|
|
473
644
|
|
|
474
645
|
You can use `configure_logging` to set up the logging behavior of the Arize package to your needs.
|
|
475
646
|
|
|
@@ -482,14 +653,14 @@ configure_logging(
|
|
|
482
653
|
)
|
|
483
654
|
```
|
|
484
655
|
|
|
485
|
-
|
|
656
|
+
### Via Environment Variables
|
|
486
657
|
|
|
487
658
|
Configure the same options as the section above, via:
|
|
488
659
|
|
|
489
660
|
```python
|
|
490
661
|
import os
|
|
491
662
|
|
|
492
|
-
#
|
|
663
|
+
# Whether or not you want to enable logging altogether
|
|
493
664
|
os.environ["ARIZE_LOG_ENABLE"] = "true"
|
|
494
665
|
# Set up the logging level
|
|
495
666
|
os.environ["ARIZE_LOG_LEVEL"] = "debug"
|
|
@@ -499,6 +670,38 @@ os.environ["ARIZE_LOG_STRUCTURED"] = "false"
|
|
|
499
670
|
|
|
500
671
|
The default behavior of Arize's logs is: enabled, `INFO` level, and not structured.
|
|
501
672
|
|
|
673
|
+
## Caching
|
|
674
|
+
|
|
675
|
+
When downloading big segments of data from Arize, such as a `Dataset` with all of its examples, the SDK will cache the file in `parquet` format under `~/.arize/datasets/dataset_<updated_at_timestamp>.parquet`.
|
|
676
|
+
|
|
677
|
+
### In Code
|
|
678
|
+
|
|
679
|
+
You can disable caching via the `enable_caching` parameter when instantiating the client, and also edit the "arize directory":
|
|
680
|
+
|
|
681
|
+
```python
|
|
682
|
+
client = ArizeClient(
|
|
683
|
+
enable_caching=False, # Optional parameter, defaults to True
|
|
684
|
+
arize_directory="my-desired-directory", # Optional parameter, defaults to ~/.arize
|
|
685
|
+
)
|
|
686
|
+
```
|
|
687
|
+
|
|
688
|
+
### Via Environment Variables
|
|
689
|
+
|
|
690
|
+
You can also configure the above via:
|
|
691
|
+
|
|
692
|
+
```python
|
|
693
|
+
import os
|
|
694
|
+
|
|
695
|
+
# Whether or not you want to enable caching
|
|
696
|
+
os.environ["ARIZE_ENABLE_CACHING"] = "true"
|
|
697
|
+
# Where you want the SDK to store the files
|
|
698
|
+
os.environ["ARIZE_DIRECTORY"] = "~/.arize"
|
|
699
|
+
```
|
|
700
|
+
|
|
701
|
+
### Clean the cache
|
|
702
|
+
|
|
703
|
+
To clean the cache you can directly `rm` the files or directory.
|
|
704
|
+
|
|
502
705
|
# Community
|
|
503
706
|
|
|
504
707
|
Join our community to connect with thousands of AI builders.
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
arize/__init__.py,sha256=
|
|
1
|
+
arize/__init__.py,sha256=G9wbTaZsccUIwntIriIIW74lS1-tHeG58Vt4XV1ZV9s,3002
|
|
2
2
|
arize/_lazy.py,sha256=1Lnm4l42t7W-m2JYCYD-S7ASBOIl0XJkBuli3Ei1VXA,2474
|
|
3
|
-
arize/client.py,sha256
|
|
4
|
-
arize/config.py,sha256=
|
|
3
|
+
arize/client.py,sha256=-SeZloT7qqWRtr1WXS5d2yn7gvpNYYyGE2yjGPvYi74,7236
|
|
4
|
+
arize/config.py,sha256=PDKUkJfGvTxX2NZ5FLxXz1YaXBOuAkyL5eW7kdbZc5A,7909
|
|
5
5
|
arize/logging.py,sha256=OahBaJRG-z5DPqWrj2_rbe2n0r4fMGOrXpxN_4M_i_w,7244
|
|
6
6
|
arize/types.py,sha256=z1yg5-brmTD4kVHDmmTVkYke53JpusXXeOOpdQw7rYg,69508
|
|
7
|
-
arize/version.py,sha256=
|
|
7
|
+
arize/version.py,sha256=dVbZUbQ1PraD-0qvMFzVVGSr1QRGrJYBgb-CUfl0LQc,25
|
|
8
8
|
arize/_exporter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
arize/_exporter/client.py,sha256=k3xS-2wx_UlB5toI5RKBoy1bi3ONIxh4KQy4A4a2Omc,15822
|
|
10
10
|
arize/_exporter/validation.py,sha256=6ROu5p7uaolxQ93lO_Eiwv9NVw_uyi3E5T--C5Klo5Q,1021
|
|
11
11
|
arize/_exporter/parsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
arize/_exporter/parsers/tracing_data_parser.py,sha256=zVS-w8t1HJkz-AIC_JCdjPJ7gJXgFpfELfqNM_vK42E,5395
|
|
13
13
|
arize/_flight/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
arize/_flight/client.py,sha256=
|
|
14
|
+
arize/_flight/client.py,sha256=14dYkHM0Pi-GP1AeNPQX-RQ3uMmtwRwxoSmR7--1eW0,15499
|
|
15
15
|
arize/_flight/types.py,sha256=GB_4dQu2ElIrcDGAcqhG7oI4g-b0ZdSlbrQkf0TFzVE,194
|
|
16
16
|
arize/_generated/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
arize/_generated/api_client_README.md,sha256=OSAc24mxj4fZB7k0i8DIZ8uoXfn6hGjptO5om6ferRE,5632
|
|
@@ -55,14 +55,14 @@ arize/_generated/protocol/flight/ingest_pb2.py,sha256=-wC5rbLK4yjROQuXOU9c_gPwA4
|
|
|
55
55
|
arize/_generated/protocol/rec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
56
56
|
arize/_generated/protocol/rec/public_pb2.py,sha256=vgP-yTSZLeomVwfIzcOo6t3i1mPCCNJGgd41ZkfLNng,79898
|
|
57
57
|
arize/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
|
-
arize/constants/config.py,sha256=
|
|
58
|
+
arize/constants/config.py,sha256=RvvMZrhbMSv3_Do1jKTVGyWt_Pwal82pIL4S9FH0XS4,1518
|
|
59
59
|
arize/constants/ml.py,sha256=X_vtKpt1AdhLoT2DWEyKDSXAVEuzjwGFacIbgUOpB3M,2358
|
|
60
60
|
arize/constants/model_mapping.json,sha256=OPE54rBATzmwRhx0tycsxnGae1jBhtqEmQqQvzleTSc,5725
|
|
61
61
|
arize/constants/openinference.py,sha256=3tVLyUz6ZvE8ht_ZLnndYXFhDjt_ibJbFeBM1PcxIbY,532
|
|
62
62
|
arize/constants/pyarrow.py,sha256=XUZQXQ-431fQYM2ZJy6xRwW4pfABPg7NZspQ5BXAxRc,24
|
|
63
63
|
arize/constants/spans.py,sha256=EfMgbEIK_2EUcvUY5BGnNAbS7bupBKePlI3j2L5T5CE,2532
|
|
64
64
|
arize/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
65
|
-
arize/datasets/client.py,sha256=
|
|
65
|
+
arize/datasets/client.py,sha256=g4qAWYkteDjcw8EgTdr4XBrtT0JYF7ewD8D-slNxAZ4,8970
|
|
66
66
|
arize/datasets/errors.py,sha256=9hmE7KyBWBSi4FkVQYsI3E-KPgzXaCZc681czNBhS-Q,1685
|
|
67
67
|
arize/datasets/validation.py,sha256=KT_X9bnEMxGbh2o9N3aXwgTMVOQPzz1AW-JyaKxcs48,1336
|
|
68
68
|
arize/embeddings/__init__.py,sha256=6_C8908W_qDixkoBJl1wapgmQCzI8TPLH207kzbYsFA,156
|
|
@@ -83,7 +83,7 @@ arize/exceptions/spaces.py,sha256=C1mbtbUx7bVFnGM7iJg03pttnd-jVl2dnFmO102wXrA,31
|
|
|
83
83
|
arize/exceptions/types.py,sha256=ALzH6S63zbHSno2n6Lp3lRf7Galo-HctrkkDU61fKBo,6050
|
|
84
84
|
arize/exceptions/values.py,sha256=aNAL4P9nN0LOtuHrIARBbty2V0ZtMgBsT1wyz1fB6Kk,18948
|
|
85
85
|
arize/experiments/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
86
|
-
arize/experiments/client.py,sha256=
|
|
86
|
+
arize/experiments/client.py,sha256=VXK2Dl8wOruvX6yLeHyhSMJ-hZIRh9AXdzrYNgxl4pM,22329
|
|
87
87
|
arize/experiments/functions.py,sha256=-6yAumc4ZZxoouEnKXkR8GxFqEFfDBCOOC3j6OAVt40,33833
|
|
88
88
|
arize/experiments/tracing.py,sha256=DGhJrJU2yUchMUVWPr_4PTqmM0VbSiNnRoV08hnN4nU,9660
|
|
89
89
|
arize/experiments/types.py,sha256=EEf0EdjldNX6Hg98bX0E9HtZeu__3Ofy0x9fDqrflAg,12752
|
|
@@ -133,13 +133,14 @@ arize/spans/validation/spans/spans_validation.py,sha256=p6IjbQMtOhotGBfw3axj7yMW
|
|
|
133
133
|
arize/spans/validation/spans/value_validation.py,sha256=H3qV96w6JQNCed_MxhWDas9Jf6vUj6RFabShcwf4jr4,19102
|
|
134
134
|
arize/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
135
135
|
arize/utils/arrow.py,sha256=6kbTY3mPL8oAk9C3sL-vE5dLuQ7bNU74qbRHcSbuIBg,5334
|
|
136
|
+
arize/utils/cache.py,sha256=5KP6D-Dru-HjB7hSwFttUf8B4veXNqK7wq82B4bfECU,1892
|
|
136
137
|
arize/utils/dataframe.py,sha256=I0FloPgNiqlKga32tMOvTE70598QA8Hhrgf-6zjYMAM,1120
|
|
137
138
|
arize/utils/openinference_conversion.py,sha256=i3QBngObcc-LrUWFe_pg9egrFs2pqqbFSncUA-wnqNE,1679
|
|
138
139
|
arize/utils/proto.py,sha256=RfdiXtq2cvIG1IV8W0jz2m-vdrA2CD8f542UUi6GLoY,381
|
|
139
140
|
arize/utils/size.py,sha256=uAM-bs7Jk7fIu6vjQ9khZuJZnpAmFvA3lTXiRT0aJS4,788
|
|
140
141
|
arize/utils/online_tasks/__init__.py,sha256=nDuTLUTYnZaWgyJoYR1P7O8ZKA-Nba7X6tJ9OislbWM,144
|
|
141
142
|
arize/utils/online_tasks/dataframe_preprocessor.py,sha256=YyeeeFu_FwCYImbYvBZvQIH_5TK2lHru8KSfqV893ps,8884
|
|
142
|
-
arize-8.0.
|
|
143
|
-
arize-8.0.
|
|
144
|
-
arize-8.0.
|
|
145
|
-
arize-8.0.
|
|
143
|
+
arize-8.0.0a17.dist-info/METADATA,sha256=FUSvD19Y91lZs32i1d3nDB1oM8Aqv38LTaL2LDlouyE,28471
|
|
144
|
+
arize-8.0.0a17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
145
|
+
arize-8.0.0a17.dist-info/licenses/LICENSE.md,sha256=8vLN8Gms62NCBorxIv9MUvuK7myueb6_-dhXHPmm4H0,1479
|
|
146
|
+
arize-8.0.0a17.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|