arize 8.0.0b0__py3-none-any.whl → 8.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +1 -1
- arize/_client_factory.py +50 -0
- arize/_flight/client.py +4 -4
- arize/_generated/api_client/__init__.py +0 -2
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +6 -6
- arize/_generated/api_client/api/projects_api.py +3 -3
- arize/_generated/api_client/models/__init__.py +0 -1
- arize/_generated/api_client/models/datasets_create_request.py +2 -10
- arize/_generated/api_client/models/datasets_examples_insert_request.py +2 -10
- arize/_generated/api_client/test/test_datasets_create_request.py +2 -6
- arize/_generated/api_client/test/test_datasets_examples_insert_request.py +2 -6
- arize/_generated/api_client/test/test_datasets_examples_list200_response.py +2 -6
- arize/_generated/api_client/test/test_datasets_examples_update_request.py +2 -6
- arize/_generated/api_client/test/test_experiments_create_request.py +2 -6
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +2 -6
- arize/_generated/api_client_README.md +0 -1
- arize/_lazy.py +25 -9
- arize/client.py +16 -52
- arize/config.py +9 -36
- arize/constants/ml.py +9 -16
- arize/constants/spans.py +5 -10
- arize/datasets/client.py +13 -9
- arize/datasets/errors.py +1 -1
- arize/datasets/validation.py +2 -2
- arize/embeddings/auto_generator.py +2 -2
- arize/embeddings/errors.py +2 -2
- arize/embeddings/tabular_generators.py +1 -1
- arize/exceptions/base.py +0 -52
- arize/exceptions/parameters.py +0 -329
- arize/experiments/__init__.py +2 -2
- arize/experiments/client.py +16 -10
- arize/experiments/evaluators/base.py +6 -6
- arize/experiments/evaluators/executors.py +10 -3
- arize/experiments/evaluators/types.py +2 -2
- arize/experiments/functions.py +24 -17
- arize/experiments/types.py +6 -8
- arize/logging.py +1 -1
- arize/ml/batch_validation/errors.py +10 -1004
- arize/ml/batch_validation/validator.py +273 -225
- arize/ml/casting.py +7 -7
- arize/ml/client.py +12 -11
- arize/ml/proto.py +6 -6
- arize/ml/stream_validation.py +2 -3
- arize/ml/surrogate_explainer/mimic.py +3 -3
- arize/ml/types.py +1 -55
- arize/pre_releases.py +6 -3
- arize/projects/client.py +9 -4
- arize/regions.py +2 -2
- arize/spans/client.py +14 -12
- arize/spans/columns.py +32 -36
- arize/spans/conversion.py +5 -6
- arize/spans/validation/common/argument_validation.py +3 -3
- arize/spans/validation/common/dataframe_form_validation.py +6 -6
- arize/spans/validation/common/value_validation.py +1 -1
- arize/spans/validation/evals/dataframe_form_validation.py +4 -4
- arize/spans/validation/evals/evals_validation.py +6 -6
- arize/spans/validation/metadata/dataframe_form_validation.py +1 -1
- arize/spans/validation/spans/dataframe_form_validation.py +2 -2
- arize/spans/validation/spans/spans_validation.py +6 -6
- arize/utils/arrow.py +2 -2
- arize/utils/cache.py +2 -2
- arize/utils/dataframe.py +4 -4
- arize/utils/online_tasks/dataframe_preprocessor.py +7 -7
- arize/utils/openinference_conversion.py +10 -10
- arize/utils/proto.py +1 -1
- arize/version.py +1 -1
- {arize-8.0.0b0.dist-info → arize-8.0.0b2.dist-info}/METADATA +71 -63
- {arize-8.0.0b0.dist-info → arize-8.0.0b2.dist-info}/RECORD +72 -73
- arize/_generated/api_client/models/primitive_value.py +0 -172
- arize/_generated/api_client/test/test_primitive_value.py +0 -50
- {arize-8.0.0b0.dist-info → arize-8.0.0b2.dist-info}/WHEEL +0 -0
- {arize-8.0.0b0.dist-info → arize-8.0.0b2.dist-info}/licenses/LICENSE +0 -0
- {arize-8.0.0b0.dist-info → arize-8.0.0b2.dist-info}/licenses/NOTICE +0 -0
arize/client.py
CHANGED
|
@@ -20,9 +20,6 @@ if TYPE_CHECKING:
|
|
|
20
20
|
|
|
21
21
|
logger = logging.getLogger(__name__)
|
|
22
22
|
|
|
23
|
-
# TODO(Kiko): Clean commented lines over the SDK
|
|
24
|
-
# TODO(Kiko): Implement https://github.com/Arize-ai/arize/pull/59917
|
|
25
|
-
|
|
26
23
|
# TODO(Kiko): Go private connect. Need a `base_domain`, such that we get:
|
|
27
24
|
# - api.<base_domain>
|
|
28
25
|
# - app.<base_domain>
|
|
@@ -31,29 +28,23 @@ logger = logging.getLogger(__name__)
|
|
|
31
28
|
|
|
32
29
|
# TODO(Kiko): Enforce type checking, remove all type ignores
|
|
33
30
|
|
|
34
|
-
# TODO(Kiko): Go over docstrings
|
|
35
|
-
# TODO(Kiko): Missing parameter descriptions in some docstrings
|
|
36
|
-
# TODO(Kiko): Missing return descriptions in some docstrings
|
|
37
|
-
|
|
38
31
|
# TODO(Kiko): Go over headers on each logging call
|
|
32
|
+
# TODO(Kiko): InvalidAdditionalHeadersError is unused. Have we handled extra headers?
|
|
39
33
|
|
|
40
34
|
# TODO(Kiko): Need to implement 'Update existing examples in a dataset'
|
|
41
35
|
|
|
42
|
-
# TODO(Kiko): why logs don't show on scripts, only on jupyter notebooks
|
|
43
36
|
# TODO(Kiko): test caching in colab environment
|
|
44
37
|
# TODO(Kiko): Protobuf versioning is too old
|
|
45
38
|
# TODO(Kiko): Go through main APIs and add CtxAdapter where missing
|
|
46
39
|
# TODO(Kiko): Search and handle other TODOs
|
|
47
|
-
# TODO(Kiko): Go over **every file** and do not import anything at runtime, use `if TYPE_CHECKING`
|
|
48
|
-
# with `from __future__ import annotations` (must include for Python < 3.11)
|
|
49
40
|
|
|
50
41
|
|
|
51
42
|
class ArizeClient(LazySubclientsMixin):
|
|
52
43
|
"""Root client for the Arize SDK.
|
|
53
44
|
|
|
54
45
|
The ArizeClient provides access to all Arize platform services including datasets,
|
|
55
|
-
experiments, ML models, projects, and spans. It uses SDKConfiguration
|
|
56
|
-
manage configuration settings.
|
|
46
|
+
experiments, ML models, projects, and spans. It uses :class:`arize.config.SDKConfiguration`
|
|
47
|
+
internally to manage configuration settings.
|
|
57
48
|
|
|
58
49
|
All parameters are optional (except api_key which must be provided via argument
|
|
59
50
|
or environment variable). For each parameter, values are resolved in this order:
|
|
@@ -105,44 +96,16 @@ class ArizeClient(LazySubclientsMixin):
|
|
|
105
96
|
"SpansClient",
|
|
106
97
|
),
|
|
107
98
|
}
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
"datasets-experiments",
|
|
119
|
-
(
|
|
120
|
-
"pydantic",
|
|
121
|
-
"wrapt",
|
|
122
|
-
# "numpy",
|
|
123
|
-
# "openinference.semconv",
|
|
124
|
-
# "opentelemetry.sdk",
|
|
125
|
-
# "opentelemetry.exporter.otlp.proto.grpc.trace_exporter",
|
|
126
|
-
),
|
|
127
|
-
),
|
|
128
|
-
"spans": (
|
|
129
|
-
"spans",
|
|
130
|
-
(
|
|
131
|
-
"google.protobuf",
|
|
132
|
-
"numpy",
|
|
133
|
-
"openinference.semconv",
|
|
134
|
-
"opentelemetry",
|
|
135
|
-
"pandas",
|
|
136
|
-
"pyarrow",
|
|
137
|
-
"requests",
|
|
138
|
-
"tqdm",
|
|
139
|
-
),
|
|
140
|
-
),
|
|
141
|
-
# Imports are gated in each method of the models client
|
|
142
|
-
# This is to allow for very lean package install if people only
|
|
143
|
-
# want to stream ML records
|
|
144
|
-
"ml": (None, ()),
|
|
145
|
-
}
|
|
99
|
+
# DISABLED: Optional dependency gating system
|
|
100
|
+
# This dict would map subclients to their optional dependencies and extra names.
|
|
101
|
+
# When enabled, it prevents loading subclients if dependencies aren't installed,
|
|
102
|
+
# showing: "Install via: pip install arize[extra-name]"
|
|
103
|
+
#
|
|
104
|
+
# To re-enable, populate with entries like:
|
|
105
|
+
# "subclient_name": ("extra-name", ("package1", "package2", "package3")),
|
|
106
|
+
# "another_subclient": (None, ()), # No requirements
|
|
107
|
+
#
|
|
108
|
+
_EXTRAS: ClassVar[dict[str, tuple[str | None, tuple[str, ...]]]] = {}
|
|
146
109
|
|
|
147
110
|
def __init__(
|
|
148
111
|
self,
|
|
@@ -175,7 +138,7 @@ class ArizeClient(LazySubclientsMixin):
|
|
|
175
138
|
api_key: Arize API key for authentication. Required - must be provided here
|
|
176
139
|
or via ARIZE_API_KEY environment variable. Raises MissingAPIKeyError if not set.
|
|
177
140
|
region: Arize region (e.g., Region.US_CENTRAL, Region.EU_WEST). When specified,
|
|
178
|
-
overrides individual host/port settings. ENV: ARIZE_REGION. Default: Region.
|
|
141
|
+
overrides individual host/port settings. ENV: ARIZE_REGION. Default: Region.UNSET.
|
|
179
142
|
api_host: Custom API endpoint host. ENV: ARIZE_API_HOST. Default: "api.arize.com".
|
|
180
143
|
api_scheme: API endpoint scheme (http/https). ENV: ARIZE_API_SCHEME. Default: "https".
|
|
181
144
|
otlp_host: OTLP endpoint host. ENV: ARIZE_OTLP_HOST. Default: "otlp.arize.com".
|
|
@@ -210,7 +173,8 @@ class ArizeClient(LazySubclientsMixin):
|
|
|
210
173
|
|
|
211
174
|
Notes:
|
|
212
175
|
Values provided to this class override environment variables, which in turn
|
|
213
|
-
override default values. See SDKConfiguration
|
|
176
|
+
override default values. See :class:`arize.config.SDKConfiguration`
|
|
177
|
+
for detailed parameter documentation.
|
|
214
178
|
"""
|
|
215
179
|
cfg_kwargs: dict = {}
|
|
216
180
|
if api_key is not None:
|
arize/config.py
CHANGED
|
@@ -3,10 +3,8 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
5
|
import sys
|
|
6
|
-
import threading
|
|
7
6
|
from dataclasses import dataclass, field, fields
|
|
8
7
|
from pathlib import Path
|
|
9
|
-
from typing import Any
|
|
10
8
|
|
|
11
9
|
from arize.constants.config import (
|
|
12
10
|
DEFAULT_API_HOST,
|
|
@@ -167,6 +165,9 @@ def _parse_bool(val: bool | str | None) -> bool:
|
|
|
167
165
|
class SDKConfiguration:
|
|
168
166
|
"""Configuration for the Arize SDK with endpoint and authentication settings.
|
|
169
167
|
|
|
168
|
+
This class holds pure configuration data and does not manage client lifecycle.
|
|
169
|
+
Client creation and caching is handled by :class:`arize.ArizeClient`.
|
|
170
|
+
|
|
170
171
|
This class is used internally by ArizeClient to manage SDK configuration. It is not
|
|
171
172
|
recommended to use this class directly; users should interact with ArizeClient
|
|
172
173
|
instead.
|
|
@@ -225,13 +226,16 @@ class SDKConfiguration:
|
|
|
225
226
|
region: Arize region (e.g., US_CENTRAL, EU_WEST). When specified, overrides
|
|
226
227
|
individual host/port settings.
|
|
227
228
|
Environment variable: ARIZE_REGION.
|
|
228
|
-
Default: Region.
|
|
229
|
+
Default: :class:`Region.UNSET`.
|
|
229
230
|
single_host: Single host to use for all endpoints. Overrides individual host settings.
|
|
230
231
|
Environment variable: ARIZE_SINGLE_HOST.
|
|
231
232
|
Default: "" (not set).
|
|
232
233
|
single_port: Single port to use for all endpoints. Overrides individual port settings (0-65535).
|
|
233
234
|
Environment variable: ARIZE_SINGLE_PORT.
|
|
234
235
|
Default: 0 (not set).
|
|
236
|
+
|
|
237
|
+
Raises:
|
|
238
|
+
MissingAPIKeyError: If api_key is not provided via argument or environment variable.
|
|
235
239
|
"""
|
|
236
240
|
|
|
237
241
|
api_key: str = field(
|
|
@@ -323,17 +327,11 @@ class SDKConfiguration:
|
|
|
323
327
|
)
|
|
324
328
|
)
|
|
325
329
|
|
|
326
|
-
# Private, excluded from comparisons & repr
|
|
327
|
-
_gen_client: Any = field(default=None, repr=False, compare=False)
|
|
328
|
-
_gen_lock: threading.Lock = field(
|
|
329
|
-
default_factory=threading.Lock, repr=False, compare=False
|
|
330
|
-
)
|
|
331
|
-
|
|
332
330
|
def __post_init__(self) -> None:
|
|
333
331
|
"""Validate and configure SDK endpoints after initialization.
|
|
334
332
|
|
|
335
333
|
Raises:
|
|
336
|
-
MissingAPIKeyError: If
|
|
334
|
+
MissingAPIKeyError: If api_key is not provided via argument or environment variable.
|
|
337
335
|
"""
|
|
338
336
|
# Validate Configuration
|
|
339
337
|
if not self.api_key:
|
|
@@ -341,7 +339,7 @@ class SDKConfiguration:
|
|
|
341
339
|
|
|
342
340
|
has_single_host = bool(self.single_host)
|
|
343
341
|
has_single_port = self.single_port != 0
|
|
344
|
-
has_region = self.region is not Region.
|
|
342
|
+
has_region = self.region is not Region.UNSET
|
|
345
343
|
if (has_single_host or has_single_port) and has_region:
|
|
346
344
|
logger.info(
|
|
347
345
|
"Multiple endpoint override options provided. Preference order is: "
|
|
@@ -444,28 +442,3 @@ class SDKConfiguration:
|
|
|
444
442
|
lines.append(f" {f.name}={val!r},")
|
|
445
443
|
lines.append(")")
|
|
446
444
|
return "\n".join(lines)
|
|
447
|
-
|
|
448
|
-
# TODO(Kiko): This may not be well placed in this class
|
|
449
|
-
def get_generated_client(self) -> object:
|
|
450
|
-
"""Get or create the generated OpenAPI client instance."""
|
|
451
|
-
# If already cached, return immediately
|
|
452
|
-
if self._gen_client is not None:
|
|
453
|
-
return self._gen_client
|
|
454
|
-
|
|
455
|
-
# Thread-safe initialization
|
|
456
|
-
with self._gen_lock:
|
|
457
|
-
if self._gen_client is not None:
|
|
458
|
-
return self._gen_client
|
|
459
|
-
|
|
460
|
-
# Import lazily so extra dependencies can be
|
|
461
|
-
# enforced outside the configuration class
|
|
462
|
-
from arize._generated import api_client as gen
|
|
463
|
-
|
|
464
|
-
cfg = gen.Configuration(host=self.api_url)
|
|
465
|
-
if self.api_key:
|
|
466
|
-
cfg.access_token = self.api_key
|
|
467
|
-
client = gen.ApiClient(cfg)
|
|
468
|
-
|
|
469
|
-
# Bypass frozen to set the cache once
|
|
470
|
-
object.__setattr__(self, "_gen_client", client)
|
|
471
|
-
return client
|
arize/constants/ml.py
CHANGED
|
@@ -3,50 +3,43 @@
|
|
|
3
3
|
import json
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
|
-
# MAX_BYTES_PER_BULK_RECORD = 100000
|
|
7
|
-
# MAX_DAYS_WITHIN_RANGE = 365
|
|
8
6
|
MIN_PREDICTION_ID_LEN = 1
|
|
9
7
|
MAX_PREDICTION_ID_LEN = 512
|
|
10
8
|
MIN_DOCUMENT_ID_LEN = 1
|
|
11
9
|
MAX_DOCUMENT_ID_LEN = 128
|
|
12
|
-
#
|
|
10
|
+
# The maximum number of characters for tag values
|
|
13
11
|
MAX_TAG_LENGTH = 20_000
|
|
14
12
|
MAX_TAG_LENGTH_TRUNCATION = 1_000
|
|
15
|
-
#
|
|
13
|
+
# The maximum number of characters for embedding raw data
|
|
16
14
|
MAX_RAW_DATA_CHARACTERS = 2_000_000
|
|
17
15
|
MAX_RAW_DATA_CHARACTERS_TRUNCATION = 5_000
|
|
18
16
|
# The maximum number of acceptable years in the past from current time for prediction_timestamps
|
|
19
17
|
MAX_PAST_YEARS_FROM_CURRENT_TIME = 5
|
|
20
18
|
# The maximum number of acceptable years in the future from current time for prediction_timestamps
|
|
21
19
|
MAX_FUTURE_YEARS_FROM_CURRENT_TIME = 1
|
|
22
|
-
#
|
|
20
|
+
# The maximum number of characters for llm model name
|
|
23
21
|
MAX_LLM_MODEL_NAME_LENGTH = 20_000
|
|
24
22
|
MAX_LLM_MODEL_NAME_LENGTH_TRUNCATION = 50
|
|
25
|
-
#
|
|
23
|
+
# The maximum number of characters for prompt template
|
|
26
24
|
MAX_PROMPT_TEMPLATE_LENGTH = 50_000
|
|
27
25
|
MAX_PROMPT_TEMPLATE_LENGTH_TRUNCATION = 5_000
|
|
28
|
-
#
|
|
26
|
+
# The maximum number of characters for prompt template version
|
|
29
27
|
MAX_PROMPT_TEMPLATE_VERSION_LENGTH = 20_000
|
|
30
28
|
MAX_PROMPT_TEMPLATE_VERSION_LENGTH_TRUNCATION = 50
|
|
31
|
-
#
|
|
29
|
+
# The maximum number of embeddings
|
|
32
30
|
MAX_NUMBER_OF_EMBEDDINGS = 30
|
|
33
31
|
MAX_EMBEDDING_DIMENSIONALITY = 20_000
|
|
34
|
-
#
|
|
32
|
+
# The maximum number of classes for multi class
|
|
35
33
|
MAX_NUMBER_OF_MULTI_CLASS_CLASSES = 500
|
|
36
34
|
MAX_MULTI_CLASS_NAME_LENGTH = 100
|
|
37
35
|
# The maximum number of references in embedding similarity search params
|
|
38
36
|
MAX_NUMBER_OF_SIMILARITY_REFERENCES = 10
|
|
39
|
-
#
|
|
40
|
-
# # Arize generated columns
|
|
41
|
-
# GENERATED_PREDICTION_LABEL_COL = "arize_generated_prediction_label"
|
|
42
|
-
# GENERATED_LLM_PARAMS_JSON_COL = "arize_generated_llm_params_json"
|
|
43
|
-
#
|
|
44
|
-
# # reserved columns for LLM run metadata
|
|
37
|
+
# reserved columns for LLM run metadata
|
|
45
38
|
LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME = "total_token_count" # noqa: S105
|
|
46
39
|
LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME = "prompt_token_count" # noqa: S105
|
|
47
40
|
LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME = "response_token_count" # noqa: S105
|
|
48
41
|
LLM_RUN_METADATA_RESPONSE_LATENCY_MS_TAG_NAME = "response_latency_ms"
|
|
49
|
-
|
|
42
|
+
|
|
50
43
|
# all reserved tags
|
|
51
44
|
RESERVED_TAG_COLS = [
|
|
52
45
|
LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME,
|
arize/constants/spans.py
CHANGED
|
@@ -5,19 +5,15 @@ DEFAULT_DATETIME_FMT = "%Y-%m-%dT%H:%M:%S.%f+00:00"
|
|
|
5
5
|
# Minimum/Maximum number of characters for span/trace/parent ids in spans
|
|
6
6
|
SPAN_ID_MIN_STR_LENGTH = 12
|
|
7
7
|
SPAN_ID_MAX_STR_LENGTH = 128
|
|
8
|
-
#
|
|
8
|
+
# Minimum/Maximum number of characters for span name
|
|
9
9
|
SPAN_NAME_MIN_STR_LENGTH = 0
|
|
10
10
|
SPAN_NAME_MAX_STR_LENGTH = 50
|
|
11
|
-
#
|
|
11
|
+
# Minimum/Maximum number of characters for span status message
|
|
12
12
|
SPAN_STATUS_MSG_MIN_STR_LENGTH = 0
|
|
13
13
|
SPAN_STATUS_MSG_MAX_STR_LENGTH = 10_000
|
|
14
|
-
#
|
|
14
|
+
# Minimum/Maximum number of characters for span event name
|
|
15
15
|
SPAN_EVENT_NAME_MAX_STR_LENGTH = 100
|
|
16
|
-
#
|
|
17
|
-
# SPAN_EVENT_ATTRS_MAX_STR_LENGTH = 10_000
|
|
18
|
-
# # Maximum number of characters for span kind
|
|
19
|
-
# SPAN_KIND_MAX_STR_LENGTH = 100
|
|
20
|
-
# SPAN_EXCEPTION_TYPE_MAX_STR_LENGTH = 100
|
|
16
|
+
# Minimum/Maximum number of characters for span event attributes
|
|
21
17
|
SPAN_EXCEPTION_MESSAGE_MAX_STR_LENGTH = 100
|
|
22
18
|
SPAN_EXCEPTION_STACK_TRACE_MAX_STR_LENGTH = 10_000
|
|
23
19
|
SPAN_IO_VALUE_MAX_STR_LENGTH = 4_000_000
|
|
@@ -29,7 +25,6 @@ SPAN_LLM_MESSAGE_ROLE_MAX_STR_LENGTH = 100
|
|
|
29
25
|
SPAN_LLM_MESSAGE_CONTENT_MAX_STR_LENGTH = 4_000_000
|
|
30
26
|
SPAN_LLM_TOOL_CALL_FUNCTION_NAME_MAX_STR_LENGTH = 500
|
|
31
27
|
SPAN_LLM_PROMPT_TEMPLATE_MAX_STR_LENGTH = 4_000_000
|
|
32
|
-
# SPAN_LLM_PROMPT_TEMPLATE_VARIABLES_MAX_STR_LENGTH = 10_000
|
|
33
28
|
SPAN_LLM_PROMPT_TEMPLATE_VERSION_MAX_STR_LENGTH = 100
|
|
34
29
|
SPAN_TOOL_NAME_MAX_STR_LENGTH = 100
|
|
35
30
|
SPAN_TOOL_DESCRIPTION_MAX_STR_LENGTH = 1_000
|
|
@@ -43,7 +38,7 @@ JSON_STRING_MAX_STR_LENGTH = 4_000_000
|
|
|
43
38
|
EVAL_LABEL_MIN_STR_LENGTH = 1 # we do not accept empty strings
|
|
44
39
|
EVAL_LABEL_MAX_STR_LENGTH = 100
|
|
45
40
|
EVAL_EXPLANATION_MAX_STR_LENGTH = 10_000
|
|
46
|
-
|
|
41
|
+
|
|
47
42
|
# # Annotation related constants
|
|
48
43
|
ANNOTATION_LABEL_MIN_STR_LENGTH = 1
|
|
49
44
|
ANNOTATION_LABEL_MAX_STR_LENGTH = 100 # Max length for annotation label string
|
arize/datasets/client.py
CHANGED
|
@@ -24,6 +24,7 @@ from arize.utils.openinference_conversion import (
|
|
|
24
24
|
from arize.utils.size import get_payload_size_mb
|
|
25
25
|
|
|
26
26
|
if TYPE_CHECKING:
|
|
27
|
+
from arize._generated.api_client.api_client import ApiClient
|
|
27
28
|
from arize.config import SDKConfiguration
|
|
28
29
|
|
|
29
30
|
logger = logging.getLogger(__name__)
|
|
@@ -41,18 +42,21 @@ class DatasetsClient:
|
|
|
41
42
|
:class:`arize.config.SDKConfiguration`.
|
|
42
43
|
"""
|
|
43
44
|
|
|
44
|
-
def __init__(
|
|
45
|
+
def __init__(
|
|
46
|
+
self, *, sdk_config: SDKConfiguration, generated_client: ApiClient
|
|
47
|
+
) -> None:
|
|
45
48
|
"""
|
|
46
49
|
Args:
|
|
47
50
|
sdk_config: Resolved SDK configuration.
|
|
51
|
+
generated_client: Shared generated API client instance.
|
|
48
52
|
""" # noqa: D205, D212
|
|
49
53
|
self._sdk_config = sdk_config
|
|
50
54
|
|
|
51
55
|
# Import at runtime so it's still lazy and extras-gated by the parent
|
|
52
56
|
from arize._generated import api_client as gen
|
|
53
57
|
|
|
54
|
-
# Use the
|
|
55
|
-
self._api = gen.DatasetsApi(
|
|
58
|
+
# Use the provided client directly
|
|
59
|
+
self._api = gen.DatasetsApi(generated_client)
|
|
56
60
|
|
|
57
61
|
@prerelease_endpoint(key="datasets.list", stage=ReleaseStage.BETA)
|
|
58
62
|
def list(
|
|
@@ -117,7 +121,7 @@ class DatasetsClient:
|
|
|
117
121
|
space_id: Space ID to create the dataset in.
|
|
118
122
|
examples: Dataset examples either as:
|
|
119
123
|
- a list of JSON-like dicts, or
|
|
120
|
-
- a pandas
|
|
124
|
+
- a :class:`pandas.DataFrame` (will be converted to records for REST).
|
|
121
125
|
force_http: If True, force REST upload even if the payload exceeds the
|
|
122
126
|
configured REST payload threshold.
|
|
123
127
|
|
|
@@ -125,7 +129,7 @@ class DatasetsClient:
|
|
|
125
129
|
The created dataset object as returned by the API.
|
|
126
130
|
|
|
127
131
|
Raises:
|
|
128
|
-
TypeError: If `examples` is not a list of dicts or a pandas
|
|
132
|
+
TypeError: If `examples` is not a list of dicts or a :class:`pandas.DataFrame`.
|
|
129
133
|
RuntimeError: If the Flight upload path is selected and the Flight request
|
|
130
134
|
fails.
|
|
131
135
|
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
@@ -205,7 +209,8 @@ class DatasetsClient:
|
|
|
205
209
|
Args:
|
|
206
210
|
dataset_id: Dataset ID to delete.
|
|
207
211
|
|
|
208
|
-
Returns:
|
|
212
|
+
Returns:
|
|
213
|
+
This method returns None on success (common empty 204 response).
|
|
209
214
|
|
|
210
215
|
Raises:
|
|
211
216
|
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
@@ -354,14 +359,13 @@ class DatasetsClient:
|
|
|
354
359
|
the latest dataset version is selected.
|
|
355
360
|
examples: Examples to append, provided as either:
|
|
356
361
|
- a list of JSON-like dicts, or
|
|
357
|
-
- a pandas
|
|
362
|
+
- a :class:`pandas.DataFrame` (converted to records before upload).
|
|
358
363
|
|
|
359
364
|
Returns:
|
|
360
365
|
The updated dataset object. To see the examples, use `list_examples()`.
|
|
361
366
|
|
|
362
367
|
Raises:
|
|
363
|
-
AssertionError: If `examples` is not a list of dicts or a pandas
|
|
364
|
-
DataFrame.
|
|
368
|
+
AssertionError: If `examples` is not a list of dicts or a :class:`pandas.DataFrame`.
|
|
365
369
|
arize._generated.api_client.exceptions.ApiException: If the REST API
|
|
366
370
|
returns an error response (e.g. 400/401/403/404/429).
|
|
367
371
|
"""
|
arize/datasets/errors.py
CHANGED
|
@@ -80,7 +80,7 @@ class RequiredColumnsError(DatasetError):
|
|
|
80
80
|
|
|
81
81
|
|
|
82
82
|
class EmptyDatasetError(DatasetError):
|
|
83
|
-
"""Raised when dataset DataFrame has no rows."""
|
|
83
|
+
"""Raised when dataset :class:`pandas.DataFrame` has no rows."""
|
|
84
84
|
|
|
85
85
|
def error_message(self) -> str:
|
|
86
86
|
"""Return the error message for this exception."""
|
arize/datasets/validation.py
CHANGED
|
@@ -8,12 +8,12 @@ from arize.datasets import errors as err
|
|
|
8
8
|
def validate_dataset_df(
|
|
9
9
|
df: pd.DataFrame,
|
|
10
10
|
) -> list[err.DatasetError]:
|
|
11
|
-
"""Validate a dataset DataFrame for structural and content errors.
|
|
11
|
+
"""Validate a dataset :class:`pandas.DataFrame` for structural and content errors.
|
|
12
12
|
|
|
13
13
|
Checks for required columns, unique ID values, and non-empty data.
|
|
14
14
|
|
|
15
15
|
Args:
|
|
16
|
-
df: The pandas
|
|
16
|
+
df: The :class:`pandas.DataFrame` to validate.
|
|
17
17
|
|
|
18
18
|
Returns:
|
|
19
19
|
A list of DatasetError objects found during validation. Empty list if valid.
|
|
@@ -62,7 +62,7 @@ class EmbeddingGenerator:
|
|
|
62
62
|
|
|
63
63
|
@classmethod
|
|
64
64
|
def list_default_models(cls) -> pd.DataFrame:
|
|
65
|
-
"""Return a DataFrame of default models for each use case."""
|
|
65
|
+
"""Return a :class:`pandas.DataFrame` of default models for each use case."""
|
|
66
66
|
df = pd.DataFrame(
|
|
67
67
|
{
|
|
68
68
|
"Area": ["NLP", "NLP", "CV", "CV", "STRUCTURED"],
|
|
@@ -87,7 +87,7 @@ class EmbeddingGenerator:
|
|
|
87
87
|
|
|
88
88
|
@classmethod
|
|
89
89
|
def list_pretrained_models(cls) -> pd.DataFrame:
|
|
90
|
-
"""Return a DataFrame of all available pretrained models."""
|
|
90
|
+
"""Return a :class:`pandas.DataFrame` of all available pretrained models."""
|
|
91
91
|
data = {
|
|
92
92
|
"Task": ["NLP" for _ in NLP_PRETRAINED_MODELS]
|
|
93
93
|
+ ["CV" for _ in CV_PRETRAINED_MODELS],
|
arize/embeddings/errors.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
class InvalidIndexError(Exception):
|
|
5
|
-
"""Raised when DataFrame or Series has an invalid index."""
|
|
5
|
+
"""Raised when :class:`pandas.DataFrame` or Series has an invalid index."""
|
|
6
6
|
|
|
7
7
|
def __repr__(self) -> str:
|
|
8
8
|
"""Return a string representation for debugging and logging."""
|
|
@@ -16,7 +16,7 @@ class InvalidIndexError(Exception):
|
|
|
16
16
|
"""Initialize the exception with field name context.
|
|
17
17
|
|
|
18
18
|
Args:
|
|
19
|
-
field_name: Name of the DataFrame or Series field with invalid index.
|
|
19
|
+
field_name: Name of the :class:`pandas.DataFrame` or Series field with invalid index.
|
|
20
20
|
"""
|
|
21
21
|
self.field_name = field_name
|
|
22
22
|
|
|
@@ -172,5 +172,5 @@ class EmbeddingGeneratorForTabularFeatures(NLPEmbeddingGenerator):
|
|
|
172
172
|
|
|
173
173
|
@staticmethod
|
|
174
174
|
def list_pretrained_models() -> pd.DataFrame:
|
|
175
|
-
"""Return a DataFrame of available pretrained tabular models."""
|
|
175
|
+
"""Return a :class:`pandas.DataFrame` of available pretrained tabular models."""
|
|
176
176
|
return pd.DataFrame({"Model Name": sorted(TABULAR_PRETRAINED_MODELS)})
|
arize/exceptions/base.py
CHANGED
|
@@ -39,21 +39,6 @@ class ValidationFailure(Exception):
|
|
|
39
39
|
self.errors = errors
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
# ----------------------
|
|
43
|
-
# Minimum required checks
|
|
44
|
-
# ----------------------
|
|
45
|
-
# class InvalidColumnNameEmptyString(ValidationError):
|
|
46
|
-
# def __repr__(self) -> str:
|
|
47
|
-
# return "Invalid_Column_Name_Empty_String"
|
|
48
|
-
#
|
|
49
|
-
# def error_message(self) -> str:
|
|
50
|
-
# return (
|
|
51
|
-
# "Empty column name found: ''. The schema cannot point to columns in the "
|
|
52
|
-
# "dataframe denoted by an empty string. You can see the columns used in the "
|
|
53
|
-
# "schema by running schema.get_used_columns()"
|
|
54
|
-
# )
|
|
55
|
-
|
|
56
|
-
|
|
57
42
|
class InvalidFieldTypeConversion(ValidationError):
|
|
58
43
|
"""Raised when fields cannot be converted to required type."""
|
|
59
44
|
|
|
@@ -79,31 +64,6 @@ class InvalidFieldTypeConversion(ValidationError):
|
|
|
79
64
|
)
|
|
80
65
|
|
|
81
66
|
|
|
82
|
-
# class InvalidFieldTypeEmbeddingFeatures(ValidationError):
|
|
83
|
-
# def __repr__(self) -> str:
|
|
84
|
-
# return "Invalid_Input_Type_Embedding_Features"
|
|
85
|
-
#
|
|
86
|
-
# def __init__(self) -> None:
|
|
87
|
-
# pass
|
|
88
|
-
#
|
|
89
|
-
# def error_message(self) -> str:
|
|
90
|
-
# return (
|
|
91
|
-
# "schema.embedding_feature_column_names should be a dictionary mapping strings "
|
|
92
|
-
# "to EmbeddingColumnNames objects"
|
|
93
|
-
# )
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
# class InvalidFieldTypePromptResponse(ValidationError):
|
|
97
|
-
# def __repr__(self) -> str:
|
|
98
|
-
# return "Invalid_Input_Type_Prompt_Response"
|
|
99
|
-
#
|
|
100
|
-
# def __init__(self, name: str) -> None:
|
|
101
|
-
# self.name = name
|
|
102
|
-
#
|
|
103
|
-
# def error_message(self) -> str:
|
|
104
|
-
# return f"'{self.name}' must be of type str or EmbeddingColumnNames"
|
|
105
|
-
|
|
106
|
-
|
|
107
67
|
class InvalidDataFrameIndex(ValidationError):
|
|
108
68
|
"""Raised when DataFrame has an invalid index that needs to be reset."""
|
|
109
69
|
|
|
@@ -117,15 +77,3 @@ class InvalidDataFrameIndex(ValidationError):
|
|
|
117
77
|
"The index of the dataframe is invalid; "
|
|
118
78
|
"reset the index by using df.reset_index(drop=True, inplace=True)"
|
|
119
79
|
)
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
# class InvalidSchemaType(ValidationError):
|
|
123
|
-
# def __repr__(self) -> str:
|
|
124
|
-
# return "Invalid_Schema_Type"
|
|
125
|
-
#
|
|
126
|
-
# def __init__(self, schema_type: str, environment: Environments) -> None:
|
|
127
|
-
# self.schema_type = schema_type
|
|
128
|
-
# self.environment = environment
|
|
129
|
-
#
|
|
130
|
-
# def error_message(self) -> str:
|
|
131
|
-
# return f"Cannot use a {self.schema_type} for a model with environment: {self.environment}"
|