arize 8.0.0a14__py3-none-any.whl → 8.0.0a16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. arize/__init__.py +70 -1
  2. arize/_flight/client.py +163 -43
  3. arize/_flight/types.py +1 -0
  4. arize/_generated/api_client/__init__.py +5 -1
  5. arize/_generated/api_client/api/datasets_api.py +6 -6
  6. arize/_generated/api_client/api/experiments_api.py +924 -61
  7. arize/_generated/api_client/api_client.py +1 -1
  8. arize/_generated/api_client/configuration.py +1 -1
  9. arize/_generated/api_client/exceptions.py +1 -1
  10. arize/_generated/api_client/models/__init__.py +3 -1
  11. arize/_generated/api_client/models/dataset.py +2 -2
  12. arize/_generated/api_client/models/dataset_version.py +1 -1
  13. arize/_generated/api_client/models/datasets_create_request.py +3 -3
  14. arize/_generated/api_client/models/datasets_list200_response.py +1 -1
  15. arize/_generated/api_client/models/datasets_list_examples200_response.py +1 -1
  16. arize/_generated/api_client/models/error.py +1 -1
  17. arize/_generated/api_client/models/experiment.py +6 -6
  18. arize/_generated/api_client/models/experiments_create_request.py +98 -0
  19. arize/_generated/api_client/models/experiments_list200_response.py +1 -1
  20. arize/_generated/api_client/models/experiments_runs_list200_response.py +92 -0
  21. arize/_generated/api_client/rest.py +1 -1
  22. arize/_generated/api_client/test/test_dataset.py +2 -1
  23. arize/_generated/api_client/test/test_dataset_version.py +1 -1
  24. arize/_generated/api_client/test/test_datasets_api.py +1 -1
  25. arize/_generated/api_client/test/test_datasets_create_request.py +2 -1
  26. arize/_generated/api_client/test/test_datasets_list200_response.py +1 -1
  27. arize/_generated/api_client/test/test_datasets_list_examples200_response.py +1 -1
  28. arize/_generated/api_client/test/test_error.py +1 -1
  29. arize/_generated/api_client/test/test_experiment.py +6 -1
  30. arize/_generated/api_client/test/test_experiments_api.py +23 -2
  31. arize/_generated/api_client/test/test_experiments_create_request.py +61 -0
  32. arize/_generated/api_client/test/test_experiments_list200_response.py +1 -1
  33. arize/_generated/api_client/test/test_experiments_runs_list200_response.py +56 -0
  34. arize/_generated/api_client_README.md +13 -8
  35. arize/client.py +19 -2
  36. arize/config.py +50 -3
  37. arize/constants/config.py +8 -2
  38. arize/constants/openinference.py +14 -0
  39. arize/constants/pyarrow.py +1 -0
  40. arize/datasets/__init__.py +0 -70
  41. arize/datasets/client.py +106 -19
  42. arize/datasets/errors.py +61 -0
  43. arize/datasets/validation.py +46 -0
  44. arize/experiments/client.py +455 -0
  45. arize/experiments/evaluators/__init__.py +0 -0
  46. arize/experiments/evaluators/base.py +255 -0
  47. arize/experiments/evaluators/exceptions.py +10 -0
  48. arize/experiments/evaluators/executors.py +502 -0
  49. arize/experiments/evaluators/rate_limiters.py +277 -0
  50. arize/experiments/evaluators/types.py +122 -0
  51. arize/experiments/evaluators/utils.py +198 -0
  52. arize/experiments/functions.py +920 -0
  53. arize/experiments/tracing.py +276 -0
  54. arize/experiments/types.py +394 -0
  55. arize/models/client.py +4 -1
  56. arize/spans/client.py +16 -20
  57. arize/utils/arrow.py +4 -3
  58. arize/utils/openinference_conversion.py +56 -0
  59. arize/utils/proto.py +13 -0
  60. arize/utils/size.py +22 -0
  61. arize/version.py +1 -1
  62. {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/METADATA +3 -1
  63. {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/RECORD +65 -44
  64. {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/WHEEL +0 -0
  65. {arize-8.0.0a14.dist-info → arize-8.0.0a16.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,56 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ Arize REST API
5
+
6
+ API specification for the backend data server. The API is hosted globally at https://api.arize.com/v2 or in your own environment. You can access the OpenAPI spec for this API at https://api.arize.com/v2/spec.yaml
7
+
8
+ The version of the OpenAPI document: 0.0.1
9
+ Generated by OpenAPI Generator (https://openapi-generator.tech)
10
+
11
+ Do not edit the class manually.
12
+ """ # noqa: E501
13
+
14
+
15
+ import unittest
16
+
17
+ from arize._generated.api_client.models.experiments_runs_list200_response import ExperimentsRunsList200Response
18
+
19
+ class TestExperimentsRunsList200Response(unittest.TestCase):
20
+ """ExperimentsRunsList200Response unit test stubs"""
21
+
22
+ def setUp(self):
23
+ pass
24
+
25
+ def tearDown(self):
26
+ pass
27
+
28
+ def make_instance(self, include_optional) -> ExperimentsRunsList200Response:
29
+ """Test ExperimentsRunsList200Response
30
+ include_optional is a boolean, when False only required
31
+ params are included, when True both required and
32
+ optional params are included """
33
+ # uncomment below to create an instance of `ExperimentsRunsList200Response`
34
+ """
35
+ model = ExperimentsRunsList200Response()
36
+ if include_optional:
37
+ return ExperimentsRunsList200Response(
38
+ experiment_runs = [
39
+ { }
40
+ ]
41
+ )
42
+ else:
43
+ return ExperimentsRunsList200Response(
44
+ experiment_runs = [
45
+ { }
46
+ ],
47
+ )
48
+ """
49
+
50
+ def testExperimentsRunsList200Response(self):
51
+ """Test ExperimentsRunsList200Response"""
52
+ # inst_req_only = self.make_instance(include_optional=False)
53
+ # inst_req_and_optional = self.make_instance(include_optional=True)
54
+
55
+ if __name__ == '__main__':
56
+ unittest.main()
@@ -1,5 +1,5 @@
1
1
  # Arize
2
- API specification for the backend data server. The API is hosted globally at https://app.arize.com/api/v1 or in your own environment. You can access the OpenAPI spec for this API at https://app.arize.com/api/v1/spec.yaml
2
+ API specification for the backend data server. The API is hosted globally at https://api.arize.com/v2 or in your own environment. You can access the OpenAPI spec for this API at https://api.arize.com/v2/spec.yaml
3
3
 
4
4
  The `arize._generated.api_client` package is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
5
5
 
@@ -74,13 +74,16 @@ All URIs are relative to *http://localhost*
74
74
 
75
75
  Class | Method | HTTP request | Description
76
76
  ------------ | ------------- | ------------- | -------------
77
- *DatasetsApi* | [**datasets_create**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_create) | **POST** /api/v1/datasets | Create a new dataset with JSON examples
78
- *DatasetsApi* | [**datasets_delete**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_delete) | **DELETE** /api/v1/datasets/{datasetId} | Delete a dataset by ID
79
- *DatasetsApi* | [**datasets_get**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_get) | **GET** /api/v1/datasets/{datasetId} | Get dataset by ID
80
- *DatasetsApi* | [**datasets_list**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_list) | **GET** /api/v1/datasets | List datasets the user has access to
81
- *DatasetsApi* | [**datasets_list_examples**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_list_examples) | **GET** /api/v1/datasets/{datasetId}/examples | List examples for a dataset
82
- *ExperimentsApi* | [**experiments_delete**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_delete) | **DELETE** /api/v1/experiments/{experimentId} | Delete an experiment by ID
83
- *ExperimentsApi* | [**experiments_list**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_list) | **GET** /api/v1/datasets/{datasetId}/experiments | List experiments for a given dataset
77
+ *DatasetsApi* | [**datasets_create**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_create) | **POST** /v2/datasets | Create a new dataset with JSON examples
78
+ *DatasetsApi* | [**datasets_delete**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_delete) | **DELETE** /v2/datasets/{datasetId} | Delete a dataset by ID
79
+ *DatasetsApi* | [**datasets_get**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_get) | **GET** /v2/datasets/{datasetId} | Get dataset by ID
80
+ *DatasetsApi* | [**datasets_list**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_list) | **GET** /v2/datasets | List datasets the user has access to
81
+ *DatasetsApi* | [**datasets_list_examples**](arize/_generated/api_client/docs/DatasetsApi.md#datasets_list_examples) | **GET** /v2/datasets/{datasetId}/examples | List examples for a dataset
82
+ *ExperimentsApi* | [**experiments_create**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_create) | **POST** /v2/experiments | Create a new experiment with run data
83
+ *ExperimentsApi* | [**experiments_delete**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_delete) | **DELETE** /v2/experiments/{experimentId} | Delete an experiment by ID
84
+ *ExperimentsApi* | [**experiments_get**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_get) | **GET** /v2/experiments/{experimentId} | Get experiment by ID
85
+ *ExperimentsApi* | [**experiments_list**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_list) | **GET** /v2/experiments | List experiments
86
+ *ExperimentsApi* | [**experiments_runs_list**](arize/_generated/api_client/docs/ExperimentsApi.md#experiments_runs_list) | **GET** /v2/experiments/{experimentId}/runs | List experiment runs for a specific experiment
84
87
 
85
88
 
86
89
  ## Documentation For Models
@@ -92,7 +95,9 @@ Class | Method | HTTP request | Description
92
95
  - [DatasetsListExamples200Response](arize/_generated/api_client/docs/DatasetsListExamples200Response.md)
93
96
  - [Error](arize/_generated/api_client/docs/Error.md)
94
97
  - [Experiment](arize/_generated/api_client/docs/Experiment.md)
98
+ - [ExperimentsCreateRequest](arize/_generated/api_client/docs/ExperimentsCreateRequest.md)
95
99
  - [ExperimentsList200Response](arize/_generated/api_client/docs/ExperimentsList200Response.md)
100
+ - [ExperimentsRunsList200Response](arize/_generated/api_client/docs/ExperimentsRunsList200Response.md)
96
101
 
97
102
 
98
103
  <a id="documentation-for-authorization"></a>
arize/client.py CHANGED
@@ -12,6 +12,10 @@ if TYPE_CHECKING:
12
12
  from arize.spans.client import SpansClient
13
13
 
14
14
 
15
+ # TODO(Kiko): Protobuf versioning is too old
16
+ # TODO(Kiko): Make sure the client has same options as SDKConfiguration
17
+ # TODO(Kiko): It does not make any sense to require space ID in run_experiment, dataset ID should suffice
18
+ # TODO(Kiko): Should probably wrap every single method of gen clients so that we can add nice docstrings
15
19
  # TODO(Kiko): Add flight max_chunksize opt to write_table. In config?
16
20
  # TODO(Kiko): experimental/datasets must be adapted into the datasets subclient
17
21
  # TODO(Kiko): experimental/prompt hub is missing
@@ -65,11 +69,21 @@ class ArizeClient(LazySubclientsMixin):
65
69
  # Gate only the generated-backed ones
66
70
  "datasets": (
67
71
  "datasets-experiments",
68
- ("pydantic",),
72
+ (
73
+ "pydantic",
74
+ "openinference.semconv",
75
+ ),
69
76
  ),
70
77
  "experiments": (
71
78
  "datasets-experiments",
72
- ("pydantic",),
79
+ (
80
+ "pydantic",
81
+ "wrapt",
82
+ # "numpy",
83
+ # "openinference.semconv",
84
+ # "opentelemetry.sdk",
85
+ # "opentelemetry.exporter.otlp.proto.grpc.trace_exporter",
86
+ ),
73
87
  ),
74
88
  "spans": (
75
89
  "spans",
@@ -98,6 +112,7 @@ class ArizeClient(LazySubclientsMixin):
98
112
  flight_server_host: str | None = None,
99
113
  flight_server_port: int | None = None,
100
114
  flight_scheme: str | None = None,
115
+ pyarrow_max_chunksize: int | None = None,
101
116
  request_verify: bool | None = None,
102
117
  stream_max_workers: int | None = None,
103
118
  stream_max_queue_bound: int | None = None,
@@ -115,6 +130,8 @@ class ArizeClient(LazySubclientsMixin):
115
130
  cfg_kwargs["flight_server_port"] = flight_server_port
116
131
  if flight_scheme is not None:
117
132
  cfg_kwargs["flight_scheme"] = flight_scheme
133
+ if pyarrow_max_chunksize is not None:
134
+ cfg_kwargs["pyarrow_max_chunksize"] = pyarrow_max_chunksize
118
135
  if request_verify is not None:
119
136
  cfg_kwargs["request_verify"] = request_verify
120
137
  if stream_max_workers is not None:
arize/config.py CHANGED
@@ -7,23 +7,30 @@ from typing import Any, Dict
7
7
 
8
8
  from arize.constants.config import (
9
9
  DEFAULT_API_HOST,
10
- DEFAULT_API_INSECURE,
11
10
  DEFAULT_FLIGHT_HOST,
12
11
  DEFAULT_FLIGHT_PORT,
13
12
  DEFAULT_FLIGHT_TRANSPORT_SCHEME,
13
+ DEFAULT_INSECURE,
14
+ DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB,
15
+ DEFAULT_OTLP_HOST,
16
+ DEFAULT_PYARROW_MAX_CHUNKSIZE,
14
17
  DEFAULT_REQUEST_VERIFY,
15
18
  DEFAULT_STREAM_MAX_QUEUE_BOUND,
16
19
  DEFAULT_STREAM_MAX_WORKERS,
17
20
  ENV_API_HOST,
18
- ENV_API_INSECURE,
19
21
  ENV_API_KEY,
20
22
  ENV_FLIGHT_HOST,
21
23
  ENV_FLIGHT_PORT,
22
24
  ENV_FLIGHT_TRANSPORT_SCHEME,
25
+ ENV_INSECURE,
26
+ ENV_MAX_HTTP_PAYLOAD_SIZE_MB,
27
+ ENV_OTLP_HOST,
28
+ ENV_PYARROW_MAX_CHUNKSIZE,
23
29
  ENV_REQUEST_VERIFY,
24
30
  ENV_STREAM_MAX_QUEUE_BOUND,
25
31
  ENV_STREAM_MAX_WORKERS,
26
32
  )
33
+ from arize.constants.pyarrow import MAX_CHUNKSIZE
27
34
  from arize.exceptions.auth import MissingAPIKeyError
28
35
  from arize.version import __version__
29
36
 
@@ -45,7 +52,7 @@ def _api_host_factory() -> str:
45
52
 
46
53
 
47
54
  def _api_scheme_factory() -> str:
48
- insecure = os.getenv(ENV_API_INSECURE, DEFAULT_API_INSECURE)
55
+ insecure = os.getenv(ENV_INSECURE, DEFAULT_INSECURE)
49
56
  if insecure:
50
57
  return "http"
51
58
  return "https"
@@ -65,6 +72,17 @@ def _flight_scheme_factory() -> str:
65
72
  )
66
73
 
67
74
 
75
+ def _pyarrow_max_chunksize() -> int:
76
+ max_chunksize = int(
77
+ os.getenv(ENV_PYARROW_MAX_CHUNKSIZE, DEFAULT_PYARROW_MAX_CHUNKSIZE)
78
+ )
79
+ if max_chunksize <= 0 or max_chunksize > MAX_CHUNKSIZE:
80
+ raise ValueError(
81
+ f"Pyarrow max_chunksize must be between 1 and {MAX_CHUNKSIZE}, got {max_chunksize}"
82
+ )
83
+ return max_chunksize
84
+
85
+
68
86
  def _verify_factory() -> bool:
69
87
  return _parse_bool(os.getenv(ENV_REQUEST_VERIFY, DEFAULT_REQUEST_VERIFY))
70
88
 
@@ -79,6 +97,25 @@ def _stream_max_queue_bound_factory() -> int:
79
97
  )
80
98
 
81
99
 
100
+ def _otlp_scheme_factory() -> str:
101
+ insecure = os.getenv(ENV_INSECURE, DEFAULT_INSECURE)
102
+ if insecure:
103
+ return "http"
104
+ return "https"
105
+
106
+
107
+ def _otlp_host_factory() -> str:
108
+ return os.getenv(ENV_OTLP_HOST, DEFAULT_OTLP_HOST)
109
+
110
+
111
+ def _max_http_payload_size_mb_factory() -> float:
112
+ return float(
113
+ os.getenv(
114
+ ENV_MAX_HTTP_PAYLOAD_SIZE_MB, DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB
115
+ )
116
+ )
117
+
118
+
82
119
  def _mask_secret(secret: str, N: int = 4) -> str:
83
120
  """Show first N chars then '***'; empty string if empty."""
84
121
  return f"{secret[:N]}***"
@@ -99,11 +136,17 @@ class SDKConfiguration:
99
136
  flight_server_host: str = field(default_factory=_flight_host_factory)
100
137
  flight_server_port: int = field(default_factory=_flight_port_factory)
101
138
  flight_scheme: str = field(default_factory=_flight_scheme_factory)
139
+ pyarrow_max_chunksize: int = field(default_factory=_pyarrow_max_chunksize)
102
140
  request_verify: bool = field(default_factory=_verify_factory)
103
141
  stream_max_workers: int = field(default_factory=_stream_max_workers_factory)
104
142
  stream_max_queue_bound: int = field(
105
143
  default_factory=_stream_max_queue_bound_factory
106
144
  )
145
+ otlp_host: str = field(default_factory=_otlp_host_factory)
146
+ otlp_scheme: str = field(default_factory=_otlp_scheme_factory)
147
+ max_http_payload_size_mb: float = field(
148
+ default_factory=_max_http_payload_size_mb_factory
149
+ )
107
150
 
108
151
  # Private, excluded from comparisons & repr
109
152
  _headers: Dict[str, str] = field(init=False, repr=False, compare=False)
@@ -121,6 +164,10 @@ class SDKConfiguration:
121
164
  def api_url(self) -> str:
122
165
  return _endpoint(self.api_scheme, self.api_host)
123
166
 
167
+ @property
168
+ def otlp_url(self) -> str:
169
+ return _endpoint(self.otlp_scheme, self.otlp_host, "/v1")
170
+
124
171
  @property
125
172
  def files_url(self) -> str:
126
173
  return _endpoint(self.api_scheme, self.api_host, "/v1/pandas_arrow")
arize/constants/config.py CHANGED
@@ -3,19 +3,25 @@ ENV_API_KEY = "ARIZE_API_KEY"
3
3
 
4
4
  # Server configuration env vars
5
5
  ENV_API_HOST = "ARIZE_API_HOST"
6
- ENV_API_INSECURE = "ARIZE_API_INSECURE"
6
+ ENV_OTLP_HOST = "ARIZE_OTLP_HOST"
7
7
  ENV_FLIGHT_HOST = "ARIZE_FLIGHT_HOST"
8
8
  ENV_FLIGHT_PORT = "ARIZE_FLIGHT_PORT"
9
9
  ENV_FLIGHT_TRANSPORT_SCHEME = "ARIZE_FLIGHT_TRANSPORT_SCHEME"
10
+ ENV_PYARROW_MAX_CHUNKSIZE = "ARIZE_MAX_CHUNKSIZE"
10
11
  ENV_REQUEST_VERIFY = "ARIZE_REQUEST_VERIFY"
12
+ ENV_INSECURE = "ARIZE_INSECURE"
13
+ ENV_MAX_HTTP_PAYLOAD_SIZE_MB = "ARIZE_MAX_HTTP_PAYLOAD_SIZE_MB"
11
14
 
12
15
  # Server configuration default values
13
16
  DEFAULT_API_HOST = "api.arize.com" # NOTE: Must not prefix with https://
14
- DEFAULT_API_INSECURE = False
17
+ DEFAULT_OTLP_HOST = "otlp.arize.com" # NOTE: Must not prefix with https://
15
18
  DEFAULT_FLIGHT_HOST = "flight.arize.com" # NOTE: Must not prefix with https://
16
19
  DEFAULT_FLIGHT_PORT = 443
17
20
  DEFAULT_FLIGHT_TRANSPORT_SCHEME = "grpc+tls"
21
+ DEFAULT_PYARROW_MAX_CHUNKSIZE = 10_000
18
22
  DEFAULT_REQUEST_VERIFY = True
23
+ DEFAULT_INSECURE = False
24
+ DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB = 100
19
25
 
20
26
  # ML Streaming configuration
21
27
  ENV_STREAM_MAX_WORKERS = "ARIZE_STREAM_MAX_WORKERS"
@@ -0,0 +1,14 @@
1
+ import openinference.semconv.trace as oinf
2
+
3
+ OPEN_INFERENCE_JSON_STR_TYPES = frozenset(
4
+ [
5
+ oinf.DocumentAttributes.DOCUMENT_METADATA,
6
+ oinf.SpanAttributes.LLM_FUNCTION_CALL,
7
+ oinf.SpanAttributes.LLM_INVOCATION_PARAMETERS,
8
+ oinf.SpanAttributes.LLM_PROMPT_TEMPLATE_VARIABLES,
9
+ oinf.MessageAttributes.MESSAGE_FUNCTION_CALL_ARGUMENTS_JSON,
10
+ oinf.SpanAttributes.METADATA,
11
+ oinf.SpanAttributes.TOOL_PARAMETERS,
12
+ oinf.ToolCallAttributes.TOOL_CALL_FUNCTION_ARGUMENTS_JSON,
13
+ ]
14
+ )
@@ -0,0 +1 @@
1
+ MAX_CHUNKSIZE = 100_000
@@ -1,70 +0,0 @@
1
- from collections.abc import Mapping
2
-
3
- from arize._generated.api_client import models
4
-
5
-
6
- def make_to_df(field_name: str):
7
- def to_df(
8
- self,
9
- by_alias: bool = False,
10
- exclude_none: str | bool = False,
11
- json_normalize: bool = False,
12
- convert_dtypes: bool = True,
13
- ):
14
- """
15
- Convert a list of objects to a pandas DataFrame.
16
-
17
- Behavior:
18
- - If an item is a Pydantic v2 model, use `.model_dump(by_alias=...)`.
19
- - If an item is a mapping (dict-like), use it as-is.
20
- - Otherwise, raise a ValueError (unsupported row type).
21
-
22
- Parameters:
23
- by_alias: Use field aliases when dumping Pydantic models.
24
- exclude_none:
25
- - False: keep Nones as-is
26
- - "all": drop columns where *all* values are None/NaN
27
- - "any": drop columns where *any* value is None/NaN
28
- - True: alias for "all"
29
- json_normalize: If True, flatten nested dicts via `pandas.json_normalize`.
30
- convert_dtypes: If True, call `DataFrame.convert_dtypes()` at the end.
31
-
32
- Returns:
33
- pandas.DataFrame
34
- """
35
- import pandas as pd
36
-
37
- items = getattr(self, field_name, []) or []
38
-
39
- rows = []
40
- for it in items:
41
- if hasattr(it, "model_dump"): # Pydantic v2 object
42
- rows.append(it.model_dump(by_alias=by_alias))
43
-
44
- elif isinstance(it, Mapping): # Plain mapping
45
- rows.append(it)
46
- else:
47
- raise ValueError(
48
- f"Cannot convert item of type {type(it)} to DataFrame row"
49
- )
50
-
51
- df = (
52
- pd.json_normalize(rows, sep=".")
53
- if json_normalize
54
- else pd.DataFrame(rows)
55
- )
56
-
57
- # Drop None/NaN columns if requested
58
- if exclude_none in ("any", "all", True):
59
- drop_how = "all" if exclude_none is True else exclude_none
60
- df.dropna(axis=1, how=drop_how, inplace=True)
61
-
62
- if convert_dtypes:
63
- df = df.convert_dtypes()
64
- return df
65
-
66
- return to_df
67
-
68
-
69
- models.DatasetsList200Response.to_df = make_to_df("datasets") # type: ignore[attr-defined]
70
- models.DatasetsListExamples200Response.to_df = make_to_df("examples") # type: ignore[attr-defined]
arize/datasets/client.py CHANGED
@@ -1,18 +1,29 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
+ import time
5
+ import uuid
4
6
  from typing import Any, Dict, List
5
7
 
6
8
  import pandas as pd
7
9
  import pyarrow as pa
8
10
 
9
11
  from arize._flight.client import ArizeFlightClient
12
+ from arize._generated.api_client import models
10
13
  from arize.config import SDKConfiguration
14
+ from arize.datasets.validation import validate_dataset_df
11
15
  from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
16
+ from arize.utils.openinference_conversion import (
17
+ convert_boolean_columns_to_str,
18
+ convert_datetime_columns_to_int,
19
+ convert_default_columns_to_json_str,
20
+ )
21
+ from arize.utils.size import get_payload_size_mb
12
22
 
13
23
  logger = logging.getLogger(__name__)
14
24
 
15
- REST_LIMIT_DATASET_EXAMPLES = 3
25
+ # TODO(Kiko): Decide based on size of payload instead
26
+ REST_LIMIT_DATASET_EXAMPLES = 0
16
27
 
17
28
 
18
29
  class DatasetsClient:
@@ -29,11 +40,60 @@ class DatasetsClient:
29
40
  self.list = self._api.datasets_list
30
41
  self.get = self._api.datasets_get
31
42
  self.delete = self._api.datasets_delete
32
- self.list_examples = self._api.datasets_list_examples
33
43
 
34
44
  # Custom methods
45
+ self.list_examples = self._list_examples
35
46
  self.create = self._create_dataset
36
47
 
48
+ def _list_examples(
49
+ self,
50
+ dataset_id: str,
51
+ dataset_version_id: str = "",
52
+ limit: int = 100,
53
+ all: bool = False,
54
+ ):
55
+ if not all:
56
+ return self._api.datasets_list_examples(
57
+ dataset_id=dataset_id,
58
+ dataset_version_id=dataset_version_id,
59
+ limit=limit,
60
+ )
61
+
62
+ # TODO(Kiko): Space ID should not be needed,
63
+ # should work on server tech debt to remove this
64
+ dataset = self.get(dataset_id=dataset_id)
65
+ space_id = dataset.space_id
66
+
67
+ with ArizeFlightClient(
68
+ api_key=self._sdk_config.api_key,
69
+ host=self._sdk_config.flight_server_host,
70
+ port=self._sdk_config.flight_server_port,
71
+ scheme=self._sdk_config.flight_scheme,
72
+ request_verify=self._sdk_config.request_verify,
73
+ max_chunksize=self._sdk_config.pyarrow_max_chunksize,
74
+ ) as flight_client:
75
+ try:
76
+ response = flight_client.get_dataset_examples(
77
+ space_id=space_id,
78
+ dataset_id=dataset_id,
79
+ dataset_version_id=dataset_version_id,
80
+ )
81
+ except Exception as e:
82
+ msg = f"Error during request: {str(e)}"
83
+ logger.error(msg)
84
+ raise RuntimeError(msg) from e
85
+ if response is None:
86
+ # This should not happen with proper Flight client implementation,
87
+ # but we handle it defensively
88
+ msg = "No response received from flight server during request"
89
+ logger.error(msg)
90
+ raise RuntimeError(msg)
91
+ # The response from the flight server is a DataFrame of examples; wrap its
92
+ # rows in the generated list-examples response model
93
+ return models.DatasetsListExamples200Response(
94
+ examples=response.to_dict(orient="records")
95
+ )
96
+
37
97
  def _create_dataset(
38
98
  self,
39
99
  name: str,
@@ -45,7 +105,11 @@ class DatasetsClient:
45
105
  raise TypeError(
46
106
  "Examples must be a list of dicts or a pandas DataFrame"
47
107
  )
48
- if len(examples) <= REST_LIMIT_DATASET_EXAMPLES or force_http:
108
+ below_threshold = (
109
+ get_payload_size_mb(examples)
110
+ <= self._sdk_config.max_http_payload_size_mb
111
+ )
112
+ if below_threshold or force_http:
49
113
  from arize._generated import api_client as gen
50
114
 
51
115
  data = (
@@ -69,7 +133,9 @@ class DatasetsClient:
69
133
  "gRPC + Flight."
70
134
  )
71
135
  data = (
72
- pd.DataFrame(examples) if isinstance(examples, list) else examples
136
+ examples
137
+ if isinstance(examples, pd.DataFrame)
138
+ else pd.DataFrame(examples)
73
139
  )
74
140
  return self._create_dataset_via_flight(
75
141
  name=name,
@@ -83,26 +149,21 @@ class DatasetsClient:
83
149
  space_id: str,
84
150
  examples: pd.DataFrame,
85
151
  ):
152
+ data = examples.copy()
86
153
  # Convert datetime columns to int64 (ms since epoch)
87
- # TODO(Kiko): Missing validation block
88
- # data = _convert_datetime_columns_to_int(data)
89
- # df = self._set_default_columns_for_dataset(data)
90
- # if convert_dict_to_json:
91
- # df = _convert_default_columns_to_json_str(df)
92
- # df = _convert_boolean_columns_to_str(df)
93
- # validation_errors = Validator.validate(df)
94
- # validation_errors.extend(
95
- # Validator.validate_max_chunk_size(max_chunk_size)
96
- # )
97
- # if validation_errors:
98
- # raise RuntimeError(
99
- # [e.error_message() for e in validation_errors]
100
- # )
154
+ data = convert_datetime_columns_to_int(data)
155
+ data = convert_boolean_columns_to_str(data)
156
+ data = _set_default_columns_for_dataset(data)
157
+ data = convert_default_columns_to_json_str(data)
158
+
159
+ validation_errors = validate_dataset_df(data)
160
+ if validation_errors:
161
+ raise RuntimeError([e.error_message() for e in validation_errors])
101
162
 
102
163
  # Convert to Arrow table
103
164
  try:
104
165
  logger.debug("Converting data to Arrow format")
105
- pa_table = pa.Table.from_pandas(examples)
166
+ pa_table = pa.Table.from_pandas(data, preserve_index=False)
106
167
  except pa.ArrowInvalid as e:
107
168
  logger.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
108
169
  raise pa.ArrowInvalid(
@@ -119,6 +180,7 @@ class DatasetsClient:
119
180
  port=self._sdk_config.flight_server_port,
120
181
  scheme=self._sdk_config.flight_scheme,
121
182
  request_verify=self._sdk_config.request_verify,
183
+ max_chunksize=self._sdk_config.pyarrow_max_chunksize,
122
184
  ) as flight_client:
123
185
  try:
124
186
  response = flight_client.create_dataset(
@@ -140,3 +202,28 @@ class DatasetsClient:
140
202
  # object we make a GET query
141
203
  dataset = self.get(dataset_id=response)
142
204
  return dataset
205
+
206
+
207
+ def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
208
+ current_time = int(time.time() * 1000)
209
+ if "created_at" in df.columns:
210
+ if df["created_at"].isnull().values.any():
211
+ df["created_at"].fillna(current_time, inplace=True)
212
+ else:
213
+ df["created_at"] = current_time
214
+
215
+ if "updated_at" in df.columns:
216
+ if df["updated_at"].isnull().values.any():
217
+ df["updated_at"].fillna(current_time, inplace=True)
218
+ else:
219
+ df["updated_at"] = current_time
220
+
221
+ if "id" in df.columns:
222
+ if df["id"].isnull().values.any():
223
+ df["id"] = df["id"].apply(
224
+ lambda x: str(uuid.uuid4()) if pd.isnull(x) else x
225
+ )
226
+ else:
227
+ df["id"] = [str(uuid.uuid4()) for _ in range(len(df))]
228
+
229
+ return df
@@ -0,0 +1,61 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
+ class DatasetError(Exception, ABC):
5
+ def __str__(self) -> str:
6
+ return self.error_message()
7
+
8
+ @abstractmethod
9
+ def __repr__(self) -> str:
10
+ pass
11
+
12
+ @abstractmethod
13
+ def error_message(self) -> str:
14
+ pass
15
+
16
+
17
+ class InvalidSessionError(DatasetError):
18
+ def error_message(self) -> str:
19
+ return (
20
+ "Credentials not provided or invalid. Please pass in the correct api_key when "
21
+ "initiating a new ArizeExportClient. Alternatively, you can set up credentials "
22
+ "in a profile or as an environment variable"
23
+ )
24
+
25
+ def __repr__(self) -> str:
26
+ return "InvalidSessionError()"
27
+
28
+
29
+ class InvalidConfigFileError(DatasetError):
30
+ def error_message(self) -> str:
31
+ return "Invalid/Misconfigured Configuration File"
32
+
33
+ def __repr__(self) -> str:
34
+ return "InvalidConfigFileError()"
35
+
36
+
37
+ class IDColumnUniqueConstraintError(DatasetError):
38
+ def error_message(self) -> str:
39
+ return "'id' column must contain unique values"
40
+
41
+ def __repr__(self) -> str:
42
+ return "IDColumnUniqueConstraintError()"
43
+
44
+
45
+ class RequiredColumnsError(DatasetError):
46
+ def __init__(self, missing_columns: set) -> None:
47
+ self.missing_columns = missing_columns
48
+
49
+ def error_message(self) -> str:
50
+ return f"Missing required columns: {self.missing_columns}"
51
+
52
+ def __repr__(self) -> str:
53
+ return f"RequiredColumnsError({self.missing_columns})"
54
+
55
+
56
+ class EmptyDatasetError(DatasetError):
57
+ def error_message(self) -> str:
58
+ return "DataFrame must have at least one row in it."
59
+
60
+ def __repr__(self) -> str:
61
+ return "EmptyDatasetError()"
@@ -0,0 +1,46 @@
1
+ from typing import List
2
+
3
+ import pandas as pd
4
+
5
+ from arize.datasets import errors as err
6
+
7
+
8
+ def validate_dataset_df(
9
+ df: pd.DataFrame,
10
+ ) -> List[err.DatasetError]:
11
+ ## check all required columns are present
12
+ required_columns_errors = _check_required_columns(df)
13
+ if required_columns_errors:
14
+ return required_columns_errors
15
+
16
+ ## check id column is unique
17
+ id_column_unique_constraint_error = _check_id_column_is_unique(df)
18
+ if id_column_unique_constraint_error:
19
+ return id_column_unique_constraint_error
20
+
21
+ # check DataFrame has at least one row in it
22
+ empty_dataframe_error = _check_empty_dataframe(df)
23
+ if empty_dataframe_error:
24
+ return empty_dataframe_error
25
+
26
+ return []
27
+
28
+
29
+ def _check_required_columns(df: pd.DataFrame) -> List[err.DatasetError]:
30
+ required_columns = ["id", "created_at", "updated_at"]
31
+ missing_columns = set(required_columns) - set(df.columns)
32
+ if missing_columns:
33
+ return [err.RequiredColumnsError(missing_columns)]
34
+ return []
35
+
36
+
37
+ def _check_id_column_is_unique(df: pd.DataFrame) -> List[err.DatasetError]:
38
+ if not df["id"].is_unique:
39
+ return [err.IDColumnUniqueConstraintError()]
40
+ return []
41
+
42
+
43
+ def _check_empty_dataframe(df: pd.DataFrame) -> List[err.DatasetError]:
44
+ if df.empty:
45
+ return [err.EmptyDatasetError()]
46
+ return []