arize-8.0.0a14-py3-none-any.whl → arize-8.0.0a16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +70 -1
- arize/_flight/client.py +163 -43
- arize/_flight/types.py +1 -0
- arize/_generated/api_client/__init__.py +5 -1
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +924 -61
- arize/_generated/api_client/api_client.py +1 -1
- arize/_generated/api_client/configuration.py +1 -1
- arize/_generated/api_client/exceptions.py +1 -1
- arize/_generated/api_client/models/__init__.py +3 -1
- arize/_generated/api_client/models/dataset.py +2 -2
- arize/_generated/api_client/models/dataset_version.py +1 -1
- arize/_generated/api_client/models/datasets_create_request.py +3 -3
- arize/_generated/api_client/models/datasets_list200_response.py +1 -1
- arize/_generated/api_client/models/datasets_list_examples200_response.py +1 -1
- arize/_generated/api_client/models/error.py +1 -1
- arize/_generated/api_client/models/experiment.py +6 -6
- arize/_generated/api_client/models/experiments_create_request.py +98 -0
- arize/_generated/api_client/models/experiments_list200_response.py +1 -1
- arize/_generated/api_client/models/experiments_runs_list200_response.py +92 -0
- arize/_generated/api_client/rest.py +1 -1
- arize/_generated/api_client/test/test_dataset.py +2 -1
- arize/_generated/api_client/test/test_dataset_version.py +1 -1
- arize/_generated/api_client/test/test_datasets_api.py +1 -1
- arize/_generated/api_client/test/test_datasets_create_request.py +2 -1
- arize/_generated/api_client/test/test_datasets_list200_response.py +1 -1
- arize/_generated/api_client/test/test_datasets_list_examples200_response.py +1 -1
- arize/_generated/api_client/test/test_error.py +1 -1
- arize/_generated/api_client/test/test_experiment.py +6 -1
- arize/_generated/api_client/test/test_experiments_api.py +23 -2
- arize/_generated/api_client/test/test_experiments_create_request.py +61 -0
- arize/_generated/api_client/test/test_experiments_list200_response.py +1 -1
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +56 -0
- arize/_generated/api_client_README.md +13 -8
- arize/client.py +19 -2
- arize/config.py +50 -3
- arize/constants/config.py +8 -2
- arize/constants/openinference.py +14 -0
- arize/constants/pyarrow.py +1 -0
- arize/datasets/__init__.py +0 -70
- arize/datasets/client.py +106 -19
- arize/datasets/errors.py +61 -0
- arize/datasets/validation.py +46 -0
- arize/experiments/client.py +455 -0
- arize/experiments/evaluators/__init__.py +0 -0
- arize/experiments/evaluators/base.py +255 -0
- arize/experiments/evaluators/exceptions.py +10 -0
- arize/experiments/evaluators/executors.py +502 -0
- arize/experiments/evaluators/rate_limiters.py +277 -0
- arize/experiments/evaluators/types.py +122 -0
- arize/experiments/evaluators/utils.py +198 -0
- arize/experiments/functions.py +920 -0
- arize/experiments/tracing.py +276 -0
- arize/experiments/types.py +394 -0
- arize/models/client.py +4 -1
- arize/spans/client.py +16 -20
- arize/utils/arrow.py +4 -3
- arize/utils/openinference_conversion.py +56 -0
- arize/utils/proto.py +13 -0
- arize/utils/size.py +22 -0
- arize/version.py +1 -1
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/METADATA +3 -1
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/RECORD +65 -44
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/WHEEL +0 -0
- {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/licenses/LICENSE.md +0 -0
arize/_generated/api_client/test/test_experiments_runs_list200_response.py ADDED

```diff
@@ -0,0 +1,56 @@
+# coding: utf-8
+
+"""
+    Arize REST API
+
+    API specification for the backend data server. The API is hosted globally at https://api.arize.com/v2 or in your own environment. You can access the OpenAPI spec for this API at https://api.arize.com/v2/spec.yaml
+
+    The version of the OpenAPI document: 0.0.1
+    Generated by OpenAPI Generator (https://openapi-generator.tech)
+
+    Do not edit the class manually.
+"""  # noqa: E501
+
+
+import unittest
+
+from arize._generated.api_client.models.experiments_runs_list200_response import ExperimentsRunsList200Response
+
+class TestExperimentsRunsList200Response(unittest.TestCase):
+    """ExperimentsRunsList200Response unit test stubs"""
+
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def make_instance(self, include_optional) -> ExperimentsRunsList200Response:
+        """Test ExperimentsRunsList200Response
+            include_optional is a boolean, when False only required
+            params are included, when True both required and
+            optional params are included """
+        # uncomment below to create an instance of `ExperimentsRunsList200Response`
+        """
+        model = ExperimentsRunsList200Response()
+        if include_optional:
+            return ExperimentsRunsList200Response(
+                experiment_runs = [
+                    { }
+                ]
+            )
+        else:
+            return ExperimentsRunsList200Response(
+                experiment_runs = [
+                    { }
+                ],
+            )
+        """
+
+    def testExperimentsRunsList200Response(self):
+        """Test ExperimentsRunsList200Response"""
+        # inst_req_only = self.make_instance(include_optional=False)
+        # inst_req_and_optional = self.make_instance(include_optional=True)
+
+if __name__ == '__main__':
+    unittest.main()
```
arize/_generated/api_client_README.md CHANGED

```diff
@@ -1,5 +1,5 @@
 # Arize
-API specification for the backend data server. The API is hosted globally at https://
+API specification for the backend data server. The API is hosted globally at https://api.arize.com/v2 or in your own environment. You can access the OpenAPI spec for this API at https://api.arize.com/v2/spec.yaml
 
 The `arize._generated.api_client` package is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
 
@@ -74,13 +74,16 @@ All URIs are relative to *http://localhost*
 
 Class | Method | HTTP request | Description
 ------------ | ------------- | ------------- | -------------
-*DatasetsApi* | [**datasets_create**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_create) | **POST** /
-*DatasetsApi* | [**datasets_delete**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_delete) | **DELETE** /
-*DatasetsApi* | [**datasets_get**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_get) | **GET** /
-*DatasetsApi* | [**datasets_list**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_list) | **GET** /
-*DatasetsApi* | [**datasets_list_examples**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_list_examples) | **GET** /
-*ExperimentsApi* | [**
-*ExperimentsApi* | [**
+*DatasetsApi* | [**datasets_create**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_create) | **POST** /v2/datasets | Create a new dataset with JSON examples
+*DatasetsApi* | [**datasets_delete**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_delete) | **DELETE** /v2/datasets/{datasetId} | Delete a dataset by ID
+*DatasetsApi* | [**datasets_get**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_get) | **GET** /v2/datasets/{datasetId} | Get dataset by ID
+*DatasetsApi* | [**datasets_list**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_list) | **GET** /v2/datasets | List datasets the user has access to
+*DatasetsApi* | [**datasets_list_examples**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_list_examples) | **GET** /v2/datasets/{datasetId}/examples | List examples for a dataset
+*ExperimentsApi* | [**experiments_create**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_create) | **POST** /v2/experiments | Create a new experiment with run data
+*ExperimentsApi* | [**experiments_delete**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_delete) | **DELETE** /v2/experiments/{experimentId} | Delete an experiment by ID
+*ExperimentsApi* | [**experiments_get**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_get) | **GET** /v2/experiments/{experimentId} | Get experiment by ID
+*ExperimentsApi* | [**experiments_list**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_list) | **GET** /v2/experiments | List experiments
+*ExperimentsApi* | [**experiments_runs_list**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_runs_list) | **GET** /v2/experiments/{experimentId}/runs | List experiment runs for a specific experiment
 
 
 ## Documentation For Models
@@ -92,7 +95,9 @@ Class | Method | HTTP request | Description
 - [DatasetsListExamples200Response](arize/_generated/api_client/docs/DatasetsListExamples200Response.md)
 - [Error](arize/_generated/api_client/docs/Error.md)
 - [Experiment](arize/_generated/api_client/docs/Experiment.md)
+- [ExperimentsCreateRequest](arize/_generated/api_client/docs/ExperimentsCreateRequest.md)
 - [ExperimentsList200Response](arize/_generated/api_client/docs/ExperimentsList200Response.md)
+- [ExperimentsRunsList200Response](arize/_generated/api_client/docs/ExperimentsRunsList200Response.md)
 
 
 <a id="documentation-for-authorization"></a>
```
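The new experiments endpoints are reachable directly through the generated client. A minimal sketch, assuming the standard OpenAPI Generator layout (`Configuration`, `ApiClient`, `ExperimentsApi`) that `arize._generated.api_client` ships; the host value, authentication, and `experiment_id` parameter name (assumed snake_case per the generator's convention) are placeholders:

```python
# Sketch: calling the new ExperimentsApi endpoints via the generated client.
from arize._generated.api_client import ApiClient, Configuration
from arize._generated.api_client.api.experiments_api import ExperimentsApi

config = Configuration(host="https://api.arize.com/v2")  # hosted API base URL
with ApiClient(config) as api_client:
    experiments = ExperimentsApi(api_client)
    # GET /v2/experiments/{experimentId}/runs -- new in this release
    runs = experiments.experiments_runs_list(experiment_id="YOUR_EXPERIMENT_ID")
```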
arize/client.py CHANGED

```diff
@@ -12,6 +12,10 @@ if TYPE_CHECKING:
     from arize.spans.client import SpansClient
 
 
+# TODO(Kiko): Protobuf versioning is too old
+# TODO(Kiko): Make sure the client has same options as SDKConfiguration
+# TODO(Kiko): It does not make any sense to require space ID in run_experiment, dataset ID should suffice
+# TODO(Kiko): Should probably wrap every single method of gen clients so that we can add nice docstrings
 # TODO(Kiko): Add flight max_chunksize opt to write_table. In config?
 # TODO(Kiko): experimental/datasets must be adapted into the datasets subclient
 # TODO(Kiko): experimental/prompt hub is missing
@@ -65,11 +69,21 @@ class ArizeClient(LazySubclientsMixin):
         # Gate only the generated-backed ones
         "datasets": (
             "datasets-experiments",
-            (
+            (
+                "pydantic",
+                "openinference.semconv",
+            ),
         ),
         "experiments": (
             "datasets-experiments",
-            (
+            (
+                "pydantic",
+                "wrapt",
+                # "numpy",
+                # "openinference.semconv",
+                # "opentelemetry.sdk",
+                # "opentelemetry.exporter.otlp.proto.grpc.trace_exporter",
+            ),
         ),
         "spans": (
             "spans",
@@ -98,6 +112,7 @@ class ArizeClient(LazySubclientsMixin):
         flight_server_host: str | None = None,
         flight_server_port: int | None = None,
         flight_scheme: str | None = None,
+        pyarrow_max_chunksize: int | None = None,
         request_verify: bool | None = None,
         stream_max_workers: int | None = None,
         stream_max_queue_bound: int | None = None,
@@ -115,6 +130,8 @@ class ArizeClient(LazySubclientsMixin):
             cfg_kwargs["flight_server_port"] = flight_server_port
         if flight_scheme is not None:
             cfg_kwargs["flight_scheme"] = flight_scheme
+        if pyarrow_max_chunksize is not None:
+            cfg_kwargs["pyarrow_max_chunksize"] = pyarrow_max_chunksize
         if request_verify is not None:
             cfg_kwargs["request_verify"] = request_verify
         if stream_max_workers is not None:
```
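A short sketch of the new constructor option. Only non-None keyword arguments are forwarded into `cfg_kwargs` (as the hunk above shows), so environment-variable and default resolution still applies for anything omitted; the `api_key` parameter is assumed from the `SDKConfiguration` handling shown later in this diff:

```python
# Sketch: opting into the new pyarrow_max_chunksize knob at construction time.
from arize.client import ArizeClient

client = ArizeClient(
    api_key="YOUR_API_KEY",       # assumed kwarg; otherwise read from ARIZE_API_KEY
    pyarrow_max_chunksize=5_000,  # new in this release; caps Arrow record-batch rows
)
```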
arize/config.py CHANGED

```diff
@@ -7,23 +7,30 @@ from typing import Any, Dict
 
 from arize.constants.config import (
     DEFAULT_API_HOST,
-    DEFAULT_API_INSECURE,
     DEFAULT_FLIGHT_HOST,
     DEFAULT_FLIGHT_PORT,
     DEFAULT_FLIGHT_TRANSPORT_SCHEME,
+    DEFAULT_INSECURE,
+    DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB,
+    DEFAULT_OTLP_HOST,
+    DEFAULT_PYARROW_MAX_CHUNKSIZE,
     DEFAULT_REQUEST_VERIFY,
     DEFAULT_STREAM_MAX_QUEUE_BOUND,
     DEFAULT_STREAM_MAX_WORKERS,
     ENV_API_HOST,
-    ENV_API_INSECURE,
     ENV_API_KEY,
     ENV_FLIGHT_HOST,
     ENV_FLIGHT_PORT,
     ENV_FLIGHT_TRANSPORT_SCHEME,
+    ENV_INSECURE,
+    ENV_MAX_HTTP_PAYLOAD_SIZE_MB,
+    ENV_OTLP_HOST,
+    ENV_PYARROW_MAX_CHUNKSIZE,
     ENV_REQUEST_VERIFY,
     ENV_STREAM_MAX_QUEUE_BOUND,
     ENV_STREAM_MAX_WORKERS,
 )
+from arize.constants.pyarrow import MAX_CHUNKSIZE
 from arize.exceptions.auth import MissingAPIKeyError
 from arize.version import __version__
 
@@ -45,7 +52,7 @@ def _api_host_factory() -> str:
 
 
 def _api_scheme_factory() -> str:
-    insecure = os.getenv(
+    insecure = os.getenv(ENV_INSECURE, DEFAULT_INSECURE)
     if insecure:
         return "http"
     return "https"
@@ -65,6 +72,17 @@ def _flight_scheme_factory() -> str:
     )
 
 
+def _pyarrow_max_chunksize() -> int:
+    max_chunksize = int(
+        os.getenv(ENV_PYARROW_MAX_CHUNKSIZE, DEFAULT_PYARROW_MAX_CHUNKSIZE)
+    )
+    if max_chunksize <= 0 or max_chunksize > MAX_CHUNKSIZE:
+        raise ValueError(
+            f"Pyarrow max_chunksize must be between 1 and {MAX_CHUNKSIZE}, got {max_chunksize}"
+        )
+    return max_chunksize
+
+
 def _verify_factory() -> bool:
     return _parse_bool(os.getenv(ENV_REQUEST_VERIFY, DEFAULT_REQUEST_VERIFY))
 
@@ -79,6 +97,25 @@ def _stream_max_queue_bound_factory() -> int:
     )
 
 
+def _otlp_scheme_factory() -> str:
+    insecure = os.getenv(ENV_INSECURE, DEFAULT_INSECURE)
+    if insecure:
+        return "http"
+    return "https"
+
+
+def _otlp_host_factory() -> str:
+    return os.getenv(ENV_OTLP_HOST, DEFAULT_OTLP_HOST)
+
+
+def _max_http_payload_size_mb_factory() -> float:
+    return float(
+        os.getenv(
+            ENV_MAX_HTTP_PAYLOAD_SIZE_MB, DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB
+        )
+    )
+
+
 def _mask_secret(secret: str, N: int = 4) -> str:
     """Show first N chars then '***'; empty string if empty."""
     return f"{secret[:N]}***"
@@ -99,11 +136,17 @@ class SDKConfiguration:
     flight_server_host: str = field(default_factory=_flight_host_factory)
     flight_server_port: int = field(default_factory=_flight_port_factory)
     flight_scheme: str = field(default_factory=_flight_scheme_factory)
+    pyarrow_max_chunksize: int = field(default_factory=_pyarrow_max_chunksize)
     request_verify: bool = field(default_factory=_verify_factory)
     stream_max_workers: int = field(default_factory=_stream_max_workers_factory)
     stream_max_queue_bound: int = field(
         default_factory=_stream_max_queue_bound_factory
     )
+    otlp_host: str = field(default_factory=_otlp_host_factory)
+    otlp_scheme: str = field(default_factory=_otlp_scheme_factory)
+    max_http_payload_size_mb: float = field(
+        default_factory=_max_http_payload_size_mb_factory
+    )
 
     # Private, excluded from comparisons & repr
     _headers: Dict[str, str] = field(init=False, repr=False, compare=False)
@@ -121,6 +164,10 @@ class SDKConfiguration:
     def api_url(self) -> str:
         return _endpoint(self.api_scheme, self.api_host)
 
+    @property
+    def otlp_url(self) -> str:
+        return _endpoint(self.otlp_scheme, self.otlp_host, "/v1")
+
     @property
     def files_url(self) -> str:
         return _endpoint(self.api_scheme, self.api_host, "/v1/pandas_arrow")
```
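With no environment overrides, the new fields resolve straight from their factories. A sketch, assuming the `api_key` field implied by the `MissingAPIKeyError` import and that `_endpoint` joins scheme, host, and path in the obvious way:

```python
# Sketch: default resolution of the new SDKConfiguration fields.
from arize.config import SDKConfiguration

cfg = SDKConfiguration(api_key="YOUR_API_KEY")
print(cfg.otlp_url)                  # expected: https://otlp.arize.com/v1
print(cfg.max_http_payload_size_mb)  # 100.0 (DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB)
print(cfg.pyarrow_max_chunksize)     # 10_000 (DEFAULT_PYARROW_MAX_CHUNKSIZE)
```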
arize/constants/config.py CHANGED

```diff
@@ -3,19 +3,25 @@ ENV_API_KEY = "ARIZE_API_KEY"
 
 # Server configuration env vars
 ENV_API_HOST = "ARIZE_API_HOST"
-
+ENV_OTLP_HOST = "ARIZE_OTLP_HOST"
 ENV_FLIGHT_HOST = "ARIZE_FLIGHT_HOST"
 ENV_FLIGHT_PORT = "ARIZE_FLIGHT_PORT"
 ENV_FLIGHT_TRANSPORT_SCHEME = "ARIZE_FLIGHT_TRANSPORT_SCHEME"
+ENV_PYARROW_MAX_CHUNKSIZE = "ARIZE_MAX_CHUNKSIZE"
 ENV_REQUEST_VERIFY = "ARIZE_REQUEST_VERIFY"
+ENV_INSECURE = "ARIZE_INSECURE"
+ENV_MAX_HTTP_PAYLOAD_SIZE_MB = "ARIZE_MAX_HTTP_PAYLOAD_SIZE_MB"
 
 # Server configuration default values
 DEFAULT_API_HOST = "api.arize.com"  # NOTE: Must not prefix with https://
-
+DEFAULT_OTLP_HOST = "otlp.arize.com"  # NOTE: Must not prefix with https://
 DEFAULT_FLIGHT_HOST = "flight.arize.com"  # NOTE: Must not prefix with https://
 DEFAULT_FLIGHT_PORT = 443
 DEFAULT_FLIGHT_TRANSPORT_SCHEME = "grpc+tls"
+DEFAULT_PYARROW_MAX_CHUNKSIZE = 10_000
 DEFAULT_REQUEST_VERIFY = True
+DEFAULT_INSECURE = False
+DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB = 100
 
 # ML Streaming configuration
 ENV_STREAM_MAX_WORKERS = "ARIZE_STREAM_MAX_WORKERS"
```
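The same knobs are reachable through environment variables, read when the configuration factories run; values below are illustrative. One caveat grounded in the config.py hunk above: `_api_scheme_factory` and `_otlp_scheme_factory` truth-test the raw string returned by `os.getenv`, so any non-empty value for `ARIZE_INSECURE` (even `"false"`) selects `http`; only `_verify_factory` parses its value through `_parse_bool`.

```python
# Sketch: configuring the new options via environment variables before the
# SDK is imported/instantiated. Values are illustrative.
import os

os.environ["ARIZE_MAX_CHUNKSIZE"] = "50000"               # ENV_PYARROW_MAX_CHUNKSIZE
os.environ["ARIZE_INSECURE"] = "1"                        # any non-empty string => http
os.environ["ARIZE_OTLP_HOST"] = "otlp.example.internal"   # hypothetical self-hosted OTLP host
os.environ["ARIZE_MAX_HTTP_PAYLOAD_SIZE_MB"] = "250"      # ENV_MAX_HTTP_PAYLOAD_SIZE_MB
```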
arize/constants/openinference.py ADDED

```diff
@@ -0,0 +1,14 @@
+import openinference.semconv.trace as oinf
+
+OPEN_INFERENCE_JSON_STR_TYPES = frozenset(
+    [
+        oinf.DocumentAttributes.DOCUMENT_METADATA,
+        oinf.SpanAttributes.LLM_FUNCTION_CALL,
+        oinf.SpanAttributes.LLM_INVOCATION_PARAMETERS,
+        oinf.SpanAttributes.LLM_PROMPT_TEMPLATE_VARIABLES,
+        oinf.MessageAttributes.MESSAGE_FUNCTION_CALL_ARGUMENTS_JSON,
+        oinf.SpanAttributes.METADATA,
+        oinf.SpanAttributes.TOOL_PARAMETERS,
+        oinf.ToolCallAttributes.TOOL_CALL_FUNCTION_ARGUMENTS_JSON,
+    ]
+)
```
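The body of `convert_default_columns_to_json_str` (used by the datasets client below) is not shown in this diff; a plausible sketch of how `OPEN_INFERENCE_JSON_STR_TYPES` would be applied to a DataFrame, JSON-encoding only the columns named in the frozenset. The helper here is hypothetical, not the packaged implementation:

```python
# Sketch: JSON-encode only the OpenInference attributes that must be strings.
import json

import pandas as pd

from arize.constants.openinference import OPEN_INFERENCE_JSON_STR_TYPES


def to_json_str_columns(df: pd.DataFrame) -> pd.DataFrame:
    # hypothetical helper for illustration only
    for col in df.columns:
        if col in OPEN_INFERENCE_JSON_STR_TYPES:
            df[col] = df[col].apply(
                lambda v: json.dumps(v) if isinstance(v, (dict, list)) else v
            )
    return df
```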
arize/constants/pyarrow.py ADDED

```diff
@@ -0,0 +1 @@
+MAX_CHUNKSIZE = 100_000
```
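Why this bound matters: pyarrow splits a table into record batches of at most `max_chunksize` rows before they are streamed (here, over Flight), and `MAX_CHUNKSIZE` is the ceiling the config validation above enforces. A small illustration with synthetic data:

```python
# Sketch: how max_chunksize caps the size of Arrow record batches.
import pyarrow as pa

table = pa.table({"id": list(range(25_000))})
batches = table.to_batches(max_chunksize=10_000)  # DEFAULT_PYARROW_MAX_CHUNKSIZE
print([b.num_rows for b in batches])              # [10000, 10000, 5000]
```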
arize/datasets/__init__.py CHANGED

```diff
@@ -1,70 +0,0 @@
-from collections.abc import Mapping
-
-from arize._generated.api_client import models
-
-
-def make_to_df(field_name: str):
-    def to_df(
-        self,
-        by_alias: bool = False,
-        exclude_none: str | bool = False,
-        json_normalize: bool = False,
-        convert_dtypes: bool = True,
-    ):
-        """
-        Convert a list of objects to a pandas DataFrame.
-
-        Behavior:
-        - If an item is a Pydantic v2 model, use `.model_dump(by_alias=...)`.
-        - If an item is a mapping (dict-like), use it as-is.
-        - Otherwise, raise a ValueError (unsupported row type).
-
-        Parameters:
-            by_alias: Use field aliases when dumping Pydantic models.
-            exclude_none:
-                - False: keep Nones as-is
-                - "all": drop columns where *all* values are None/NaN
-                - "any": drop columns where *any* value is None/NaN
-                - True: alias for "all"
-            json_normalize: If True, flatten nested dicts via `pandas.json_normalize`.
-            convert_dtypes: If True, call `DataFrame.convert_dtypes()` at the end.
-
-        Returns:
-            pandas.DataFrame
-        """
-        import pandas as pd
-
-        items = getattr(self, field_name, []) or []
-
-        rows = []
-        for it in items:
-            if hasattr(it, "model_dump"):  # Pydantic v2 object
-                rows.append(it.model_dump(by_alias=by_alias))
-
-            elif isinstance(it, Mapping):  # Plain mapping
-                rows.append(it)
-            else:
-                raise ValueError(
-                    f"Cannot convert item of type {type(it)} to DataFrame row"
-                )
-
-        df = (
-            pd.json_normalize(rows, sep=".")
-            if json_normalize
-            else pd.DataFrame(rows)
-        )
-
-        # Drop None/NaN columns if requested
-        if exclude_none in ("any", "all", True):
-            drop_how = "all" if exclude_none is True else exclude_none
-            df.dropna(axis=1, how=drop_how, inplace=True)
-
-        if convert_dtypes:
-            df = df.convert_dtypes()
-        return df
-
-    return to_df
-
-
-models.DatasetsList200Response.to_df = make_to_df("datasets")  # type: ignore[attr-defined]
-models.DatasetsListExamples200Response.to_df = make_to_df("examples")  # type: ignore[attr-defined]
```
arize/datasets/client.py CHANGED

```diff
@@ -1,18 +1,29 @@
 from __future__ import annotations
 
 import logging
+import time
+import uuid
 from typing import Any, Dict, List
 
 import pandas as pd
 import pyarrow as pa
 
 from arize._flight.client import ArizeFlightClient
+from arize._generated.api_client import models
 from arize.config import SDKConfiguration
+from arize.datasets.validation import validate_dataset_df
 from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
+from arize.utils.openinference_conversion import (
+    convert_boolean_columns_to_str,
+    convert_datetime_columns_to_int,
+    convert_default_columns_to_json_str,
+)
+from arize.utils.size import get_payload_size_mb
 
 logger = logging.getLogger(__name__)
 
-
+# TODO(Kiko): Decide based on size of payload instead
+REST_LIMIT_DATASET_EXAMPLES = 0
 
 
 class DatasetsClient:
@@ -29,11 +40,60 @@ class DatasetsClient:
         self.list = self._api.datasets_list
         self.get = self._api.datasets_get
         self.delete = self._api.datasets_delete
-        self.list_examples = self._api.datasets_list_examples
 
         # Custom methods
+        self.list_examples = self._list_examples
        self.create = self._create_dataset
 
+    def _list_examples(
+        self,
+        dataset_id: str,
+        dataset_version_id: str = "",
+        limit: int = 100,
+        all: bool = False,
+    ):
+        if not all:
+            return self._api.datasets_list_examples(
+                dataset_id=dataset_id,
+                dataset_version_id=dataset_version_id,
+                limit=limit,
+            )
+
+        # TODO(Kiko): Space ID should not be needed,
+        # should work on server tech debt to remove this
+        dataset = self.get(dataset_id=dataset_id)
+        space_id = dataset.space_id
+
+        with ArizeFlightClient(
+            api_key=self._sdk_config.api_key,
+            host=self._sdk_config.flight_server_host,
+            port=self._sdk_config.flight_server_port,
+            scheme=self._sdk_config.flight_scheme,
+            request_verify=self._sdk_config.request_verify,
+            max_chunksize=self._sdk_config.pyarrow_max_chunksize,
+        ) as flight_client:
+            try:
+                response = flight_client.get_dataset_examples(
+                    space_id=space_id,
+                    dataset_id=dataset_id,
+                    dataset_version_id=dataset_version_id,
+                )
+            except Exception as e:
+                msg = f"Error during request: {str(e)}"
+                logger.error(msg)
+                raise RuntimeError(msg) from e
+            if response is None:
+                # This should not happen with proper Flight client implementation,
+                # but we handle it defensively
+                msg = "No response received from flight server during request"
+                logger.error(msg)
+                raise RuntimeError(msg)
+            # The response from flightserver is the dataset ID. To return the dataset
+            # object we make a GET query
+            return models.DatasetsListExamples200Response(
+                examples=response.to_dict(orient="records")
+            )
+
     def _create_dataset(
         self,
         name: str,
@@ -45,7 +105,11 @@ class DatasetsClient:
             raise TypeError(
                 "Examples must be a list of dicts or a pandas DataFrame"
             )
-
+        below_threshold = (
+            get_payload_size_mb(examples)
+            <= self._sdk_config.max_http_payload_size_mb
+        )
+        if below_threshold or force_http:
             from arize._generated import api_client as gen
 
             data = (
@@ -69,7 +133,9 @@ class DatasetsClient:
                 "gRPC + Flight."
             )
             data = (
-
+                examples
+                if isinstance(examples, pd.DataFrame)
+                else pd.DataFrame(examples)
             )
             return self._create_dataset_via_flight(
                 name=name,
@@ -83,26 +149,21 @@ class DatasetsClient:
         space_id: str,
         examples: pd.DataFrame,
     ):
+        data = examples.copy()
         # Convert datetime columns to int64 (ms since epoch)
-
-
-
-
-
-
-
-
-        # Validator.validate_max_chunk_size(max_chunk_size)
-        # )
-        # if validation_errors:
-        #     raise RuntimeError(
-        #         [e.error_message() for e in validation_errors]
-        #     )
+        data = convert_datetime_columns_to_int(data)
+        data = convert_boolean_columns_to_str(data)
+        data = _set_default_columns_for_dataset(data)
+        data = convert_default_columns_to_json_str(data)
+
+        validation_errors = validate_dataset_df(data)
+        if validation_errors:
+            raise RuntimeError([e.error_message() for e in validation_errors])
 
         # Convert to Arrow table
         try:
             logger.debug("Converting data to Arrow format")
-            pa_table = pa.Table.from_pandas(
+            pa_table = pa.Table.from_pandas(data, preserve_index=False)
         except pa.ArrowInvalid as e:
             logger.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
             raise pa.ArrowInvalid(
@@ -119,6 +180,7 @@ class DatasetsClient:
             port=self._sdk_config.flight_server_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
+            max_chunksize=self._sdk_config.pyarrow_max_chunksize,
         ) as flight_client:
             try:
                 response = flight_client.create_dataset(
@@ -140,3 +202,28 @@ class DatasetsClient:
         # object we make a GET query
         dataset = self.get(dataset_id=response)
         return dataset
+
+
+def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
+    current_time = int(time.time() * 1000)
+    if "created_at" in df.columns:
+        if df["created_at"].isnull().values.any():
+            df["created_at"].fillna(current_time, inplace=True)
+    else:
+        df["created_at"] = current_time
+
+    if "updated_at" in df.columns:
+        if df["updated_at"].isnull().values.any():
+            df["updated_at"].fillna(current_time, inplace=True)
+    else:
+        df["updated_at"] = current_time
+
+    if "id" in df.columns:
+        if df["id"].isnull().values.any():
+            df["id"] = df["id"].apply(
+                lambda x: str(uuid.uuid4()) if pd.isnull(x) else x
+            )
+    else:
+        df["id"] = [str(uuid.uuid4()) for _ in range(len(df))]
+
+    return df
```
arize/datasets/errors.py ADDED

```diff
@@ -0,0 +1,61 @@
+from abc import ABC, abstractmethod
+
+
+class DatasetError(Exception, ABC):
+    def __str__(self) -> str:
+        return self.error_message()
+
+    @abstractmethod
+    def __repr__(self) -> str:
+        pass
+
+    @abstractmethod
+    def error_message(self) -> str:
+        pass
+
+
+class InvalidSessionError(DatasetError):
+    def error_message(self) -> str:
+        return (
+            "Credentials not provided or invalid. Please pass in the correct api_key when "
+            "initiating a new ArizeExportClient. Alternatively, you can set up credentials "
+            "in a profile or as an environment variable"
+        )
+
+    def __repr__(self) -> str:
+        return "InvalidSessionError()"
+
+
+class InvalidConfigFileError(DatasetError):
+    def error_message(self) -> str:
+        return "Invalid/Misconfigured Configuration File"
+
+    def __repr__(self) -> str:
+        return "InvalidConfigFileError()"
+
+
+class IDColumnUniqueConstraintError(DatasetError):
+    def error_message(self) -> str:
+        return "'id' column must contain unique values"
+
+    def __repr__(self) -> str:
+        return "IDColumnUniqueConstraintError()"
+
+
+class RequiredColumnsError(DatasetError):
+    def __init__(self, missing_columns: set) -> None:
+        self.missing_columns = missing_columns
+
+    def error_message(self) -> str:
+        return f"Missing required columns: {self.missing_columns}"
+
+    def __repr__(self) -> str:
+        return f"RequiredColumnsError({self.missing_columns})"
+
+
+class EmptyDatasetError(DatasetError):
+    def error_message(self) -> str:
+        return "DataFrame must have at least one row in it."
+
+    def __repr__(self) -> str:
+        return "EmptyDatasetError()"
```
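The `DatasetError` ABC requires both `error_message` and `__repr__`, and `__str__` delegates to `error_message`. A hypothetical subclass showing the contract a new validation error must meet:

```python
# Sketch: the shape a new DatasetError subclass has to take (illustrative only).
from arize.datasets.errors import DatasetError


class DuplicateColumnNameError(DatasetError):
    def error_message(self) -> str:
        return "DataFrame column names must be unique"

    def __repr__(self) -> str:
        return "DuplicateColumnNameError()"
```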
arize/datasets/validation.py ADDED

```diff
@@ -0,0 +1,46 @@
+from typing import List
+
+import pandas as pd
+
+from arize.datasets import errors as err
+
+
+def validate_dataset_df(
+    df: pd.DataFrame,
+) -> List[err.DatasetError]:
+    ## check all require columns are present
+    required_columns_errors = _check_required_columns(df)
+    if required_columns_errors:
+        return required_columns_errors
+
+    ## check id column is unique
+    id_column_unique_constraint_error = _check_id_column_is_unique(df)
+    if id_column_unique_constraint_error:
+        return id_column_unique_constraint_error
+
+    # check DataFrame has at least one row in it
+    emtpy_dataframe_error = _check_empty_dataframe(df)
+    if emtpy_dataframe_error:
+        return emtpy_dataframe_error
+
+    return []
+
+
+def _check_required_columns(df: pd.DataFrame) -> List[err.DatasetError]:
+    required_columns = ["id", "created_at", "updated_at"]
+    missing_columns = set(required_columns) - set(df.columns)
+    if missing_columns:
+        return [err.RequiredColumnsError(missing_columns)]
+    return []
+
+
+def _check_id_column_is_unique(df: pd.DataFrame) -> List[err.DatasetError]:
+    if not df["id"].is_unique:
+        return [err.IDColumnUniqueConstraintError()]
+    return []
+
+
+def _check_empty_dataframe(df: pd.DataFrame) -> List[err.DatasetError]:
+    if df.empty:
+        return [err.EmptyDatasetError()]
+    return []
```