arize 8.0.0b1__py3-none-any.whl → 8.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +9 -2
- arize/_client_factory.py +50 -0
- arize/_exporter/client.py +18 -17
- arize/_exporter/parsers/tracing_data_parser.py +9 -4
- arize/_exporter/validation.py +1 -1
- arize/_flight/client.py +37 -17
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +6 -6
- arize/_generated/api_client/api/projects_api.py +3 -3
- arize/_lazy.py +61 -10
- arize/client.py +66 -50
- arize/config.py +175 -48
- arize/constants/config.py +1 -0
- arize/constants/ml.py +9 -16
- arize/constants/spans.py +5 -10
- arize/datasets/client.py +45 -28
- arize/datasets/errors.py +1 -1
- arize/datasets/validation.py +2 -2
- arize/embeddings/auto_generator.py +16 -9
- arize/embeddings/base_generators.py +15 -9
- arize/embeddings/cv_generators.py +2 -2
- arize/embeddings/errors.py +2 -2
- arize/embeddings/nlp_generators.py +8 -8
- arize/embeddings/tabular_generators.py +6 -6
- arize/exceptions/base.py +0 -52
- arize/exceptions/config.py +22 -0
- arize/exceptions/parameters.py +1 -330
- arize/exceptions/values.py +8 -5
- arize/experiments/__init__.py +4 -0
- arize/experiments/client.py +31 -18
- arize/experiments/evaluators/base.py +12 -9
- arize/experiments/evaluators/executors.py +16 -7
- arize/experiments/evaluators/rate_limiters.py +3 -1
- arize/experiments/evaluators/types.py +9 -7
- arize/experiments/evaluators/utils.py +7 -5
- arize/experiments/functions.py +128 -58
- arize/experiments/tracing.py +4 -1
- arize/experiments/types.py +34 -31
- arize/logging.py +54 -33
- arize/ml/batch_validation/errors.py +10 -1004
- arize/ml/batch_validation/validator.py +351 -291
- arize/ml/bounded_executor.py +25 -6
- arize/ml/casting.py +51 -33
- arize/ml/client.py +43 -35
- arize/ml/proto.py +21 -22
- arize/ml/stream_validation.py +64 -27
- arize/ml/surrogate_explainer/mimic.py +18 -10
- arize/ml/types.py +27 -67
- arize/pre_releases.py +10 -6
- arize/projects/client.py +9 -4
- arize/py.typed +0 -0
- arize/regions.py +11 -11
- arize/spans/client.py +125 -31
- arize/spans/columns.py +32 -36
- arize/spans/conversion.py +12 -11
- arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
- arize/spans/validation/annotations/value_validation.py +11 -14
- arize/spans/validation/common/argument_validation.py +3 -3
- arize/spans/validation/common/dataframe_form_validation.py +7 -7
- arize/spans/validation/common/value_validation.py +11 -14
- arize/spans/validation/evals/dataframe_form_validation.py +4 -4
- arize/spans/validation/evals/evals_validation.py +6 -6
- arize/spans/validation/evals/value_validation.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +1 -1
- arize/spans/validation/metadata/dataframe_form_validation.py +2 -2
- arize/spans/validation/metadata/value_validation.py +23 -1
- arize/spans/validation/spans/dataframe_form_validation.py +2 -2
- arize/spans/validation/spans/spans_validation.py +6 -6
- arize/utils/arrow.py +38 -2
- arize/utils/cache.py +2 -2
- arize/utils/dataframe.py +4 -4
- arize/utils/online_tasks/dataframe_preprocessor.py +15 -11
- arize/utils/openinference_conversion.py +10 -10
- arize/utils/proto.py +0 -1
- arize/utils/types.py +6 -6
- arize/version.py +1 -1
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/METADATA +32 -7
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/RECORD +81 -78
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/WHEEL +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/LICENSE +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/NOTICE +0 -0
arize/datasets/client.py
CHANGED
@@ -5,7 +5,7 @@ from __future__ import annotations
 import logging
 import time
 import uuid
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, cast
 
 import pandas as pd
 import pyarrow as pa
@@ -24,6 +24,11 @@ from arize.utils.openinference_conversion import (
 from arize.utils.size import get_payload_size_mb
 
 if TYPE_CHECKING:
+    # builtins is needed to use builtins.list in type annotations because
+    # the class has a list() method that shadows the built-in list type
+    import builtins
+
+    from arize._generated.api_client.api_client import ApiClient
     from arize.config import SDKConfiguration
 
 logger = logging.getLogger(__name__)
@@ -41,18 +46,21 @@ class DatasetsClient:
         :class:`arize.config.SDKConfiguration`.
     """
 
-    def __init__(
+    def __init__(
+        self, *, sdk_config: SDKConfiguration, generated_client: ApiClient
+    ) -> None:
         """
         Args:
             sdk_config: Resolved SDK configuration.
+            generated_client: Shared generated API client instance.
        """  # noqa: D205, D212
         self._sdk_config = sdk_config
 
         # Import at runtime so it's still lazy and extras-gated by the parent
         from arize._generated import api_client as gen
 
-        # Use the
-        self._api = gen.DatasetsApi(
+        # Use the provided client directly
+        self._api = gen.DatasetsApi(generated_client)
 
     @prerelease_endpoint(key="datasets.list", stage=ReleaseStage.BETA)
     def list(
@@ -93,7 +101,7 @@
         *,
         name: str,
         space_id: str,
-        examples: list[dict[str, object]] | pd.DataFrame,
+        examples: builtins.list[dict[str, object]] | pd.DataFrame,
         force_http: bool = False,
     ) -> models.Dataset:
         """Create a dataset with JSON examples.
@@ -117,7 +125,7 @@
             space_id: Space ID to create the dataset in.
             examples: Dataset examples either as:
                 - a list of JSON-like dicts, or
-                - a pandas
+                - a :class:`pandas.DataFrame` (will be converted to records for REST).
             force_http: If True, force REST upload even if the payload exceeds the
                 configured REST payload threshold.
 
@@ -125,7 +133,7 @@
             The created dataset object as returned by the API.
 
         Raises:
-            TypeError: If `examples` is not a list of dicts or a pandas
+            TypeError: If `examples` is not a list of dicts or a :class:`pandas.DataFrame`.
             RuntimeError: If the Flight upload path is selected and the Flight request
                 fails.
             arize._generated.api_client.exceptions.ApiException: If the REST API
@@ -146,7 +154,7 @@
         from arize._generated import api_client as gen
 
         data = (
-            examples.to_dict(orient="records")
+            examples.to_dict(orient="records")
             if isinstance(examples, pd.DataFrame)
             else examples
         )
@@ -154,7 +162,8 @@
         body = gen.DatasetsCreateRequest(
             name=name,
             space_id=space_id,
-
+            # Cast: pandas to_dict returns dict[Hashable, Any] but API requires dict[str, Any]
+            examples=cast("list[dict[str, Any]]", data),
         )
         return self._api.datasets_create(datasets_create_request=body)
 
@@ -165,15 +174,12 @@
             "Trying to convert to DataFrame for more efficient upload via "
             "gRPC + Flight."
         )
-
-            examples
-            if isinstance(examples, pd.DataFrame)
-            else pd.DataFrame(examples)
-        )
+        if not isinstance(examples, pd.DataFrame):
+            examples = pd.DataFrame(examples)
         return self._create_dataset_via_flight(
             name=name,
             space_id=space_id,
-            examples=
+            examples=examples,
         )
 
     @prerelease_endpoint(key="datasets.get", stage=ReleaseStage.BETA)
@@ -205,7 +211,8 @@
         Args:
             dataset_id: Dataset ID to delete.
 
-        Returns:
+        Returns:
+            This method returns None on success (common empty 204 response).
 
         Raises:
             arize._generated.api_client.exceptions.ApiException: If the REST API
@@ -275,7 +282,11 @@
         )
         if dataset_df is not None:
             return models.DatasetsExamplesList200Response(
-
+                # Cast: Pydantic validates and converts dicts to DatasetExample at runtime
+                examples=cast(
+                    "list[models.DatasetExample]",
+                    dataset_df.to_dict(orient="records"),
+                ),
                 pagination=models.PaginationMetadata(
                     has_more=False,  # Note that all=True
                 ),
@@ -316,7 +327,11 @@
         )
 
         return models.DatasetsExamplesList200Response(
-
+            # Cast: Pydantic validates and converts dicts to DatasetExample at runtime
+            examples=cast(
+                "list[models.DatasetExample]",
+                dataset_df.to_dict(orient="records"),
+            ),
             pagination=models.PaginationMetadata(
                 has_more=False,  # Note that all=True
             ),
@@ -331,7 +346,7 @@
         *,
         dataset_id: str,
         dataset_version_id: str = "",
-        examples: list[dict[str, object]] | pd.DataFrame,
+        examples: builtins.list[dict[str, object]] | pd.DataFrame,
     ) -> models.Dataset:
         """Append new examples to an existing dataset.
 
@@ -354,14 +369,13 @@
             the latest dataset version is selected.
             examples: Examples to append, provided as either:
                 - a list of JSON-like dicts, or
-                - a pandas
+                - a :class:`pandas.DataFrame` (converted to records before upload).
 
         Returns:
             The updated dataset object. To see the examples, use `list_examples()`.
 
         Raises:
-            AssertionError: If `examples` is not a list of dicts or a pandas
-                DataFrame.
+            AssertionError: If `examples` is not a list of dicts or a :class:`pandas.DataFrame`.
             arize._generated.api_client.exceptions.ApiException: If the REST API
                 returns an error response (e.g. 400/401/403/404/429).
         """
@@ -373,11 +387,14 @@
         )
 
         data = (
-            examples.to_dict(orient="records")
+            examples.to_dict(orient="records")
             if isinstance(examples, pd.DataFrame)
             else examples
         )
-
+        # Cast: pandas to_dict returns dict[Hashable, Any] but API requires dict[str, Any]
+        body = gen.DatasetsExamplesInsertRequest(
+            examples=cast("list[dict[str, Any]]", data)
+        )
 
         return self._api.datasets_examples_insert(
             dataset_id=dataset_id,
@@ -390,7 +407,7 @@
         name: str,
         space_id: str,
         examples: pd.DataFrame,
-    ) ->
+    ) -> models.Dataset:
         """Internal method to create a dataset using Flight protocol for large example sets."""
         data = examples.copy()
         # Convert datetime columns to int64 (ms since epoch)
@@ -450,19 +467,19 @@ def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
     """Set default values for created_at and updated_at columns if missing or null."""
     current_time = int(time.time() * 1000)
     if "created_at" in df.columns:
-        if df["created_at"].isnull().
+        if df["created_at"].isnull().any():
             df["created_at"].fillna(current_time, inplace=True)
     else:
         df["created_at"] = current_time
 
     if "updated_at" in df.columns:
-        if df["updated_at"].isnull().
+        if df["updated_at"].isnull().any():
             df["updated_at"].fillna(current_time, inplace=True)
     else:
         df["updated_at"] = current_time
 
     if "id" in df.columns:
-        if df["id"].isnull().
+        if df["id"].isnull().any():
             df["id"] = df["id"].apply(
                 lambda x: str(uuid.uuid4()) if pd.isnull(x) else x
             )
arize/datasets/errors.py
CHANGED
@@ -80,7 +80,7 @@ class RequiredColumnsError(DatasetError):
 
 
 class EmptyDatasetError(DatasetError):
-    """Raised when dataset DataFrame has no rows."""
+    """Raised when dataset :class:`pandas.DataFrame` has no rows."""
 
     def error_message(self) -> str:
         """Return the error message for this exception."""
arize/datasets/validation.py
CHANGED
@@ -8,12 +8,12 @@ from arize.datasets import errors as err
 def validate_dataset_df(
     df: pd.DataFrame,
 ) -> list[err.DatasetError]:
-    """Validate a dataset DataFrame for structural and content errors.
+    """Validate a dataset :class:`pandas.DataFrame` for structural and content errors.
 
     Checks for required columns, unique ID values, and non-empty data.
 
     Args:
-        df: The pandas
+        df: The :class:`pandas.DataFrame` to validate.
 
     Returns:
         A list of DatasetError objects found during validation. Empty list if valid.
arize/embeddings/auto_generator.py
CHANGED
@@ -1,5 +1,7 @@
 """Automatic embedding generation factory for various ML use cases."""
 
+from typing import TypeAlias
+
 import pandas as pd
 
 from arize.embeddings import constants
@@ -24,9 +26,14 @@ from arize.embeddings.nlp_generators import (
 from arize.embeddings.tabular_generators import (
     EmbeddingGeneratorForTabularFeatures,
 )
-from arize.embeddings.usecases import
+from arize.embeddings.usecases import (
+    CVUseCases,
+    NLPUseCases,
+    TabularUseCases,
+    UseCases,
+)
 
-UseCaseLike = str |
+UseCaseLike: TypeAlias = str | NLPUseCases | CVUseCases | TabularUseCases
 
 
 class EmbeddingGenerator:
@@ -49,20 +56,20 @@ class EmbeddingGenerator:
     ) -> BaseEmbeddingGenerator:
         """Create an embedding generator for the specified use case."""
         if use_case == UseCases.NLP.SEQUENCE_CLASSIFICATION:
-            return EmbeddingGeneratorForNLPSequenceClassification(**kwargs)
+            return EmbeddingGeneratorForNLPSequenceClassification(**kwargs)  # type: ignore[arg-type]
         if use_case == UseCases.NLP.SUMMARIZATION:
-            return EmbeddingGeneratorForNLPSummarization(**kwargs)
+            return EmbeddingGeneratorForNLPSummarization(**kwargs)  # type: ignore[arg-type]
         if use_case == UseCases.CV.IMAGE_CLASSIFICATION:
-            return EmbeddingGeneratorForCVImageClassification(**kwargs)
+            return EmbeddingGeneratorForCVImageClassification(**kwargs)  # type: ignore[arg-type]
         if use_case == UseCases.CV.OBJECT_DETECTION:
-            return EmbeddingGeneratorForCVObjectDetection(**kwargs)
+            return EmbeddingGeneratorForCVObjectDetection(**kwargs)  # type: ignore[arg-type]
         if use_case == UseCases.STRUCTURED.TABULAR_EMBEDDINGS:
-            return EmbeddingGeneratorForTabularFeatures(**kwargs)
+            return EmbeddingGeneratorForTabularFeatures(**kwargs)  # type: ignore[arg-type]
         raise ValueError(f"Invalid use case {use_case}")
 
     @classmethod
     def list_default_models(cls) -> pd.DataFrame:
-        """Return a DataFrame of default models for each use case."""
+        """Return a :class:`pandas.DataFrame` of default models for each use case."""
         df = pd.DataFrame(
             {
                 "Area": ["NLP", "NLP", "CV", "CV", "STRUCTURED"],
@@ -87,7 +94,7 @@ class EmbeddingGenerator:
 
     @classmethod
     def list_pretrained_models(cls) -> pd.DataFrame:
-        """Return a DataFrame of all available pretrained models."""
+        """Return a :class:`pandas.DataFrame` of all available pretrained models."""
         data = {
             "Task": ["NLP" for _ in NLP_PRETRAINED_MODELS]
             + ["CV" for _ in CV_PRETRAINED_MODELS],
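
The `UseCaseLike` change above also adds an explicit `TypeAlias` marker, which tells type checkers that the assignment defines a type alias rather than an ordinary module variable. A minimal sketch of the pattern (the enums below are illustrative stand-ins, not the SDK's):

from enum import Enum
from typing import TypeAlias


class NLPUseCasesSketch(Enum):
    SEQUENCE_CLASSIFICATION = "sequence_classification"


class CVUseCasesSketch(Enum):
    IMAGE_CLASSIFICATION = "image_classification"


# With the annotation, checkers treat this as a type alias usable in
# signatures, not as a variable that happens to hold a type expression.
UseCaseLikeSketch: TypeAlias = str | NLPUseCasesSketch | CVUseCasesSketch


def resolve(use_case: UseCaseLikeSketch) -> str:
    # Accepts either a raw string or one of the enum members.
    return use_case.value if isinstance(use_case, Enum) else use_case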
arize/embeddings/base_generators.py
CHANGED
@@ -14,11 +14,15 @@ try:
     import torch
     from datasets import Dataset
     from PIL import Image
-    from transformers import (
+    from transformers import (
         AutoImageProcessor,
         AutoModel,
         AutoTokenizer,
+        BaseImageProcessor,
         BatchEncoding,
+        BatchFeature,
+        PreTrainedModel,
+        PreTrainedTokenizerBase,
     )
     from transformers.utils import logging as transformer_logging
 except ImportError as e:
@@ -67,7 +71,9 @@ class BaseEmbeddingGenerator(ABC):
         raise
 
     @abstractmethod
-    def generate_embeddings(
+    def generate_embeddings(
+        self, **kwargs: object
+    ) -> pd.Series | tuple[pd.Series, pd.Series]:
         """Generate embeddings for the input data."""
         ...
 
@@ -95,7 +101,7 @@ class BaseEmbeddingGenerator(ABC):
         return self.__model_name
 
     @property
-    def model(self) ->
+    def model(self) -> PreTrainedModel:
         """Return the underlying model instance."""
         return self.__model
 
@@ -183,7 +189,7 @@ class NLPEmbeddingGenerator(BaseEmbeddingGenerator):
             tokenizer_max_length: Maximum sequence length for the tokenizer.
             **kwargs: Additional arguments for model initialization.
         """
-        super().__init__(use_case=use_case, model_name=model_name, **kwargs)
+        super().__init__(use_case=use_case, model_name=model_name, **kwargs)  # type: ignore[arg-type]
         self.__tokenizer_max_length = tokenizer_max_length
         # We don't check for the tokenizer's existence since it is coupled with the corresponding model
         # We check the model's existence in `BaseEmbeddingGenerator`
@@ -193,7 +199,7 @@ class NLPEmbeddingGenerator(BaseEmbeddingGenerator):
         )
 
     @property
-    def tokenizer(self) ->
+    def tokenizer(self) -> PreTrainedTokenizerBase:
         """Return the tokenizer instance for text processing."""
         return self.__tokenizer
 
@@ -240,7 +246,7 @@ class CVEmbeddingGenerator(BaseEmbeddingGenerator):
             model_name: Name of the pre-trained vision model.
             **kwargs: Additional arguments for model initialization.
         """
-        super().__init__(use_case=use_case, model_name=model_name, **kwargs)
+        super().__init__(use_case=use_case, model_name=model_name, **kwargs)  # type: ignore[arg-type]
         logger.info("Downloading image processor")
         # We don't check for the image processor's existence since it is coupled with the corresponding model
         # We check the model's existence in `BaseEmbeddingGenerator`
@@ -249,7 +255,7 @@ class CVEmbeddingGenerator(BaseEmbeddingGenerator):
         )
 
     @property
-    def image_processor(self) ->
+    def image_processor(self) -> BaseImageProcessor:
         """Return the image processor instance for image preprocessing."""
         return self.__image_processor
 
@@ -262,7 +268,7 @@ class CVEmbeddingGenerator(BaseEmbeddingGenerator):
 
     def preprocess_image(
         self, batch: dict[str, list[str]], local_image_feat_name: str
-    ) ->
+    ) -> BatchFeature:
         """Preprocess a batch of images for model input."""
         return self.image_processor(
             [
@@ -272,7 +278,7 @@ class CVEmbeddingGenerator(BaseEmbeddingGenerator):
             return_tensors="pt",
         ).to(self.device)
 
-    def generate_embeddings(self, local_image_path_col: pd.Series) -> pd.Series:
+    def generate_embeddings(self, local_image_path_col: pd.Series) -> pd.Series:  # type: ignore[override]
         """Obtain embedding vectors from your image data using pre-trained image models.
 
         :param local_image_path_col: a pandas Series containing the local path to the images to
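
The broadened abstract signature above (`generate_embeddings(self, **kwargs: object)`) explains the `# type: ignore[override]` comments that recur through the subclass diffs: each concrete generator narrows the signature to specific parameters, which mypy flags as a non-Liskov override, and the SDK silences deliberately. A minimal sketch of that trade-off, with hypothetical names:

from abc import ABC, abstractmethod


class GeneratorSketch(ABC):
    @abstractmethod
    def generate(self, **kwargs: object) -> list[float]:
        """Deliberately loose: concrete signatures vary per use case."""
        ...


class TextGeneratorSketch(GeneratorSketch):
    # mypy reports "Signature of 'generate' incompatible with supertype":
    # code holding a GeneratorSketch could pass kwargs this override
    # rejects, so the narrowing is flagged and suppressed on purpose.
    def generate(self, text: str) -> list[float]:  # type: ignore[override]
        return [float(len(text))]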
arize/embeddings/cv_generators.py
CHANGED
@@ -25,7 +25,7 @@ class EmbeddingGeneratorForCVImageClassification(CVEmbeddingGenerator):
         super().__init__(
             use_case=UseCases.CV.IMAGE_CLASSIFICATION,
             model_name=model_name,
-            **kwargs,
+            **kwargs,  # type: ignore[arg-type]
         )
 
 
@@ -46,5 +46,5 @@ class EmbeddingGeneratorForCVObjectDetection(CVEmbeddingGenerator):
         super().__init__(
             use_case=UseCases.CV.OBJECT_DETECTION,
             model_name=model_name,
-            **kwargs,
+            **kwargs,  # type: ignore[arg-type]
         )
arize/embeddings/errors.py
CHANGED
@@ -2,7 +2,7 @@
 
 
 class InvalidIndexError(Exception):
-    """Raised when DataFrame or Series has an invalid index."""
+    """Raised when :class:`pandas.DataFrame` or Series has an invalid index."""
 
     def __repr__(self) -> str:
         """Return a string representation for debugging and logging."""
@@ -16,7 +16,7 @@ class InvalidIndexError(Exception):
         """Initialize the exception with field name context.
 
         Args:
-            field_name: Name of the DataFrame or Series field with invalid index.
+            field_name: Name of the :class:`pandas.DataFrame` or Series field with invalid index.
         """
         self.field_name = field_name
 
arize/embeddings/nlp_generators.py
CHANGED
@@ -39,10 +39,10 @@ class EmbeddingGeneratorForNLPSequenceClassification(NLPEmbeddingGenerator):
         super().__init__(
             use_case=UseCases.NLP.SEQUENCE_CLASSIFICATION,
             model_name=model_name,
-            **kwargs,
+            **kwargs,  # type: ignore[arg-type]
         )
 
-    def generate_embeddings(
+    def generate_embeddings(  # type: ignore[override]
         self,
         text_col: pd.Series,
         class_label_col: pd.Series | None = None,
@@ -65,10 +65,10 @@ class EmbeddingGeneratorForNLPSequenceClassification(NLPEmbeddingGenerator):
         if class_label_col is not None:
             if not isinstance(class_label_col, pd.Series):
                 raise TypeError("class_label_col must be a pandas Series")
-
+            temp_df = pd.concat(
                 {"text": text_col, "class_label": class_label_col}, axis=1
             )
-            prepared_text_col =
+            prepared_text_col = temp_df.apply(
                 lambda row: f" The classification label is {row['class_label']}. {row['text']}",
                 axis=1,
             )
@@ -83,8 +83,8 @@ class EmbeddingGeneratorForNLPSequenceClassification(NLPEmbeddingGenerator):
             batched=True,
             batch_size=self.batch_size,
         )
-
-        return
+        result_df: pd.DataFrame = ds.to_pandas()
+        return result_df["embedding_vector"]
 
 
 class EmbeddingGeneratorForNLPSummarization(NLPEmbeddingGenerator):
@@ -104,10 +104,10 @@ class EmbeddingGeneratorForNLPSummarization(NLPEmbeddingGenerator):
         super().__init__(
             use_case=UseCases.NLP.SUMMARIZATION,
             model_name=model_name,
-            **kwargs,
+            **kwargs,  # type: ignore[arg-type]
         )
 
-    def generate_embeddings(
+    def generate_embeddings(  # type: ignore[override]
         self,
         text_col: pd.Series,
     ) -> pd.Series:
arize/embeddings/tabular_generators.py
CHANGED
@@ -64,10 +64,10 @@ class EmbeddingGeneratorForTabularFeatures(NLPEmbeddingGenerator):
         super().__init__(
             use_case=UseCases.STRUCTURED.TABULAR_EMBEDDINGS,
             model_name=model_name,
-            **kwargs,
+            **kwargs,  # type: ignore[arg-type]
         )
 
-    def generate_embeddings(
+    def generate_embeddings(  # type: ignore[override]
         self,
         df: pd.DataFrame,
         selected_columns: list[str],
@@ -145,11 +145,11 @@ class EmbeddingGeneratorForTabularFeatures(NLPEmbeddingGenerator):
             batch_size=self.batch_size,
         )
 
-
+        result_df: pd.DataFrame = ds.to_pandas()
         if return_prompt_col:
-            return
+            return result_df["embedding_vector"], prompts
 
-        return
+        return result_df["embedding_vector"]
 
     @staticmethod
     def __prompt_fn(row: pd.DataFrame, columns: list[str]) -> str:
@@ -172,5 +172,5 @@ class EmbeddingGeneratorForTabularFeatures(NLPEmbeddingGenerator):
 
     @staticmethod
     def list_pretrained_models() -> pd.DataFrame:
-        """Return a DataFrame of available pretrained tabular models."""
+        """Return a :class:`pandas.DataFrame` of available pretrained tabular models."""
         return pd.DataFrame({"Model Name": sorted(TABULAR_PRETRAINED_MODELS)})
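
The `result_df: pd.DataFrame = ds.to_pandas()` lines in this file and in nlp_generators.py pin down a loosely typed return value before indexing into it, so `result_df["embedding_vector"]` is checked as a DataFrame operation rather than propagating Any. A minimal sketch of the idiom, with a hypothetical stand-in for the loosely typed call:

from typing import Any

import pandas as pd


def to_pandas_like() -> Any:
    """Hypothetical stand-in for a library call whose stubs return Any."""
    return pd.DataFrame({"embedding_vector": [[0.1, 0.2], [0.3, 0.4]]})


# Annotating the intermediate turns an untyped value into a checked
# DataFrame, so the column access below is verified by the type checker.
result_df: pd.DataFrame = to_pandas_like()
print(result_df["embedding_vector"])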
arize/exceptions/base.py
CHANGED
@@ -39,21 +39,6 @@ class ValidationFailure(Exception):
         self.errors = errors
 
 
-# ----------------------
-# Minimum required checks
-# ----------------------
-# class InvalidColumnNameEmptyString(ValidationError):
-#     def __repr__(self) -> str:
-#         return "Invalid_Column_Name_Empty_String"
-#
-#     def error_message(self) -> str:
-#         return (
-#             "Empty column name found: ''. The schema cannot point to columns in the "
-#             "dataframe denoted by an empty string. You can see the columns used in the "
-#             "schema by running schema.get_used_columns()"
-#         )
-
-
 class InvalidFieldTypeConversion(ValidationError):
     """Raised when fields cannot be converted to required type."""
 
@@ -79,31 +64,6 @@ class InvalidFieldTypeConversion(ValidationError):
         )
 
 
-# class InvalidFieldTypeEmbeddingFeatures(ValidationError):
-#     def __repr__(self) -> str:
-#         return "Invalid_Input_Type_Embedding_Features"
-#
-#     def __init__(self) -> None:
-#         pass
-#
-#     def error_message(self) -> str:
-#         return (
-#             "schema.embedding_feature_column_names should be a dictionary mapping strings "
-#             "to EmbeddingColumnNames objects"
-#         )
-
-
-# class InvalidFieldTypePromptResponse(ValidationError):
-#     def __repr__(self) -> str:
-#         return "Invalid_Input_Type_Prompt_Response"
-#
-#     def __init__(self, name: str) -> None:
-#         self.name = name
-#
-#     def error_message(self) -> str:
-#         return f"'{self.name}' must be of type str or EmbeddingColumnNames"
-
-
 class InvalidDataFrameIndex(ValidationError):
     """Raised when DataFrame has an invalid index that needs to be reset."""
 
@@ -117,15 +77,3 @@ class InvalidDataFrameIndex(ValidationError):
             "The index of the dataframe is invalid; "
             "reset the index by using df.reset_index(drop=True, inplace=True)"
         )
-
-
-# class InvalidSchemaType(ValidationError):
-#     def __repr__(self) -> str:
-#         return "Invalid_Schema_Type"
-#
-#     def __init__(self, schema_type: str, environment: Environments) -> None:
-#         self.schema_type = schema_type
-#         self.environment = environment
-#
-#     def error_message(self) -> str:
-#         return f"Cannot use a {self.schema_type} for a model with environment: {self.environment}"
arize/exceptions/config.py
ADDED
@@ -0,0 +1,22 @@
+"""Configuration validation exceptions."""
+
+from __future__ import annotations
+
+
+class MultipleEndpointOverridesError(Exception):
+    """Raised when multiple endpoint override options are provided.
+
+    Only one of the following can be specified: region, single_host/single_port, or base_domain.
+    """
+
+    def __init__(self, message: str) -> None:
+        """Initialize the exception with an optional custom message.
+
+        Args:
+            message: Custom error message, or empty string.
+        """
+        self.message = message
+
+    def __str__(self) -> str:
+        """Return the error message."""
+        return self.message