arize 8.0.0b2__py3-none-any.whl → 8.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. arize/__init__.py +8 -1
  2. arize/_exporter/client.py +18 -17
  3. arize/_exporter/parsers/tracing_data_parser.py +9 -4
  4. arize/_exporter/validation.py +1 -1
  5. arize/_flight/client.py +33 -13
  6. arize/_lazy.py +37 -2
  7. arize/client.py +61 -35
  8. arize/config.py +168 -14
  9. arize/constants/config.py +1 -0
  10. arize/datasets/client.py +32 -19
  11. arize/embeddings/auto_generator.py +14 -7
  12. arize/embeddings/base_generators.py +15 -9
  13. arize/embeddings/cv_generators.py +2 -2
  14. arize/embeddings/nlp_generators.py +8 -8
  15. arize/embeddings/tabular_generators.py +5 -5
  16. arize/exceptions/config.py +22 -0
  17. arize/exceptions/parameters.py +1 -1
  18. arize/exceptions/values.py +8 -5
  19. arize/experiments/__init__.py +4 -0
  20. arize/experiments/client.py +17 -11
  21. arize/experiments/evaluators/base.py +6 -3
  22. arize/experiments/evaluators/executors.py +6 -4
  23. arize/experiments/evaluators/rate_limiters.py +3 -1
  24. arize/experiments/evaluators/types.py +7 -5
  25. arize/experiments/evaluators/utils.py +7 -5
  26. arize/experiments/functions.py +111 -48
  27. arize/experiments/tracing.py +4 -1
  28. arize/experiments/types.py +31 -26
  29. arize/logging.py +53 -32
  30. arize/ml/batch_validation/validator.py +82 -70
  31. arize/ml/bounded_executor.py +25 -6
  32. arize/ml/casting.py +45 -27
  33. arize/ml/client.py +35 -28
  34. arize/ml/proto.py +16 -17
  35. arize/ml/stream_validation.py +63 -25
  36. arize/ml/surrogate_explainer/mimic.py +15 -7
  37. arize/ml/types.py +26 -12
  38. arize/pre_releases.py +7 -6
  39. arize/py.typed +0 -0
  40. arize/regions.py +10 -10
  41. arize/spans/client.py +113 -21
  42. arize/spans/conversion.py +7 -5
  43. arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
  44. arize/spans/validation/annotations/value_validation.py +11 -14
  45. arize/spans/validation/common/dataframe_form_validation.py +1 -1
  46. arize/spans/validation/common/value_validation.py +10 -13
  47. arize/spans/validation/evals/value_validation.py +1 -1
  48. arize/spans/validation/metadata/argument_validation.py +1 -1
  49. arize/spans/validation/metadata/dataframe_form_validation.py +1 -1
  50. arize/spans/validation/metadata/value_validation.py +23 -1
  51. arize/utils/arrow.py +37 -1
  52. arize/utils/online_tasks/dataframe_preprocessor.py +8 -4
  53. arize/utils/proto.py +0 -1
  54. arize/utils/types.py +6 -6
  55. arize/version.py +1 -1
  56. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/METADATA +18 -3
  57. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/RECORD +60 -58
  58. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/WHEEL +0 -0
  59. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/licenses/LICENSE +0 -0
  60. {arize-8.0.0b2.dist-info → arize-8.0.1.dist-info}/licenses/NOTICE +0 -0
arize/config.py CHANGED
@@ -25,6 +25,7 @@ from arize.constants.config import (
     ENV_API_KEY,
     ENV_API_SCHEME,
     ENV_ARIZE_DIRECTORY,
+    ENV_BASE_DOMAIN,
     ENV_ENABLE_CACHING,
     ENV_FLIGHT_HOST,
     ENV_FLIGHT_PORT,
@@ -42,6 +43,7 @@ from arize.constants.config import (
 )
 from arize.constants.pyarrow import MAX_CHUNKSIZE
 from arize.exceptions.auth import MissingAPIKeyError
+from arize.exceptions.config import MultipleEndpointOverridesError
 from arize.regions import REGION_ENDPOINTS, Region
 from arize.version import __version__
 
@@ -53,18 +55,44 @@ ALLOWED_HTTP_SCHEMES = {"http", "https"}
 
 
 def _is_sensitive_field(name: str) -> bool:
+    """Check if a field name contains sensitive information markers.
+
+    Args:
+        name: The field name to check.
+
+    Returns:
+        bool: True if the field name contains 'key', 'token', or 'secret' (case-insensitive).
+    """
     n = name.lower()
     return bool(any(k in n for k in SENSITIVE_FIELD_MARKERS))
 
 
 def _mask_secret(secret: str, N: int = 4) -> str:
-    """Show first N chars then '***'; empty string if empty."""
+    """Mask a secret string by showing only the first N characters.
+
+    Args:
+        secret: The secret string to mask.
+        N: Number of characters to show before masking. Defaults to 4.
+
+    Returns:
+        str: The masked string (first N chars + '***'), or empty string if input is empty.
+    """
     if len(secret) == 0:
         return ""
     return f"{secret[:N]}***"
 
 
 def _endpoint(scheme: str, base: str, path: str = "") -> str:
+    """Construct a full endpoint URL from scheme, base, and optional path.
+
+    Args:
+        scheme: The URL scheme (e.g., "http", "https").
+        base: The base URL or hostname.
+        path: Optional path to append to the base URL. Defaults to empty string.
+
+    Returns:
+        str: The fully constructed endpoint URL.
+    """
     endpoint = scheme + "://" + base.rstrip("/")
     if path:
         endpoint += "/" + path.lstrip("/")
@@ -72,6 +100,18 @@ def _endpoint(scheme: str, base: str, path: str = "") -> str:
 
 
 def _env_http_scheme(name: str, default: str) -> str:
+    """Get an HTTP scheme from environment variable with validation.
+
+    Args:
+        name: The environment variable name.
+        default: The default value if the environment variable is not set.
+
+    Returns:
+        str: The validated HTTP scheme ('http' or 'https').
+
+    Raises:
+        ValueError: If the scheme is not 'http' or 'https'.
+    """
     v = _env_str(name, default).lower()
     if v not in ALLOWED_HTTP_SCHEMES:
         raise ValueError(
@@ -86,6 +126,20 @@ def _env_str(
     min_len: int | None = None,
     max_len: int | None = None,
 ) -> str:
+    """Get a string value from environment variable with length validation.
+
+    Args:
+        name: The environment variable name.
+        default: The default value if the environment variable is not set.
+        min_len: Optional minimum length constraint for the string.
+        max_len: Optional maximum length constraint for the string.
+
+    Returns:
+        str: The validated string value (stripped of whitespace).
+
+    Raises:
+        ValueError: If the string length violates min_len or max_len constraints.
+    """
     val = os.getenv(name, default).strip()
 
     if min_len is not None and len(val) < min_len:
@@ -107,6 +161,20 @@ def _env_int(
     min_val: int | None = None,
     max_val: int | None = None,
 ) -> int:
+    """Get an integer value from environment variable with range validation.
+
+    Args:
+        name: The environment variable name.
+        default: The default value if the environment variable is not set.
+        min_val: Optional minimum value constraint for the integer.
+        max_val: Optional maximum value constraint for the integer.
+
+    Returns:
+        int: The validated integer value.
+
+    Raises:
+        ValueError: If the value cannot be parsed as an integer or violates min_val/max_val constraints.
+    """
     raw = os.getenv(name, default)
     try:
         val = int(raw)
@@ -132,6 +200,20 @@ def _env_float(
     min_val: float | None = None,
     max_val: float | None = None,
 ) -> float:
+    """Get a float value from environment variable with range validation.
+
+    Args:
+        name: The environment variable name.
+        default: The default value if the environment variable is not set.
+        min_val: Optional minimum value constraint for the float.
+        max_val: Optional maximum value constraint for the float.
+
+    Returns:
+        float: The validated float value.
+
+    Raises:
+        ValueError: If the value cannot be parsed as a float or violates min_val/max_val constraints.
+    """
     raw = os.getenv(name, default)
     try:
         val = float(raw)
@@ -152,10 +234,28 @@
 
 
 def _env_bool(name: str, default: bool) -> bool:
+    """Get a boolean value from environment variable.
+
+    Args:
+        name: The environment variable name.
+        default: The default boolean value if the environment variable is not set.
+
+    Returns:
+        bool: The parsed boolean value.
+    """
     return _parse_bool(os.getenv(name, str(default)))
 
 
 def _parse_bool(val: bool | str | None) -> bool:
+    """Parse a boolean value from various input types.
+
+    Args:
+        val: The value to parse. Can be a bool, string, or None.
+
+    Returns:
+        bool: True if the value is already True or matches one of the truthy strings
+            ('1', 'true', 'yes', 'on', case-insensitive). False otherwise.
+    """
     if isinstance(val, bool):
         return val
     return (val or "").strip().lower() in {"1", "true", "yes", "on"}
@@ -227,15 +327,27 @@ class SDKConfiguration:
             individual host/port settings.
             Environment variable: ARIZE_REGION.
             Default: :class:`Region.UNSET`.
-        single_host: Single host to use for all endpoints. Overrides individual host settings.
+        single_host: Single host to use for all endpoints. When specified, overrides
+            individual host settings.
             Environment variable: ARIZE_SINGLE_HOST.
             Default: "" (not set).
-        single_port: Single port to use for all endpoints. Overrides individual port settings (0-65535).
+        single_port: Single port to use for all endpoints. When specified, overrides
+            individual port settings (0-65535).
            Environment variable: ARIZE_SINGLE_PORT.
            Default: 0 (not set).
+        base_domain: Base domain for generating all endpoint hosts. Intended for Private Connect
+            setups. When specified, generates hosts as api.<base_domain>, otlp.<base_domain>,
+            flight.<base_domain>. When specified, overrides individual host settings.
+            Environment variable: ARIZE_BASE_DOMAIN.
+            Default: "" (not set).
+
+    Note:
+        The endpoint override options (region, single_host/single_port, base_domain) are
+        mutually exclusive. Specifying more than one will raise MultipleEndpointOverridesError.
 
     Raises:
         MissingAPIKeyError: If api_key is not provided via argument or environment variable.
+        MultipleEndpointOverridesError: If multiple endpoint override options are provided.
     """
 
     api_key: str = field(
@@ -326,27 +438,73 @@ class SDKConfiguration:
             ENV_SINGLE_PORT, 0, min_val=0, max_val=65535
         )
     )
+    base_domain: str = field(
+        default_factory=lambda: _env_str(ENV_BASE_DOMAIN, "")
+    )
 
     def __post_init__(self) -> None:
         """Validate and configure SDK endpoints after initialization.
 
+        Endpoint override options are mutually exclusive. Only one of the following
+        can be specified:
+            1. region - Overrides all via REGION_ENDPOINTS mapping
+            2. single_host/single_port - Overrides individual hosts/ports
+            3. base_domain - Generates hosts from base domain
+
+        If none are specified, per-endpoint host/port settings are used.
+
         Raises:
-            MissingAPIKeyError: If api_key is not provided via argument or environment variable.
+            MissingAPIKeyError: If api_key is not provided.
+            MultipleEndpointOverridesError: If multiple endpoint override options are provided.
         """
-        # Validate Configuration
+        # Validate configuration
         if not self.api_key:
             raise MissingAPIKeyError()
 
+        # Check which override options are set
+        has_base_domain = bool(self.base_domain)
         has_single_host = bool(self.single_host)
         has_single_port = self.single_port != 0
         has_region = self.region is not Region.UNSET
-        if (has_single_host or has_single_port) and has_region:
+
+        # Ensure only one override method is used (mutually exclusive)
+        override_count = sum(
+            [has_base_domain, has_single_host or has_single_port, has_region]
+        )
+        if override_count > 1:
+            # Determine which overrides were provided
+            provided_overrides = []
+            if has_region:
+                provided_overrides.append(f"region={self.region.value}")
+            if has_single_host or has_single_port:
+                if has_single_host:
+                    provided_overrides.append(
+                        f"single_host={self.single_host!r}"
+                    )
+                if has_single_port:
+                    provided_overrides.append(f"single_port={self.single_port}")
+            if has_base_domain:
+                provided_overrides.append(f"base_domain={self.base_domain!r}")
+
+            error_message = (
+                f"Multiple endpoint override options provided: {', '.join(provided_overrides)}. "
+                "Only one of the following can be specified: 'region', "
+                "'single_host'/'single_port', or 'base_domain'."
+            )
+            logger.error(error_message)
+            raise MultipleEndpointOverridesError(error_message)
+
+        if has_base_domain:
             logger.info(
-                "Multiple endpoint override options provided. Preference order is: "
-                "region > single_host/single_port > per-endpoint host/port."
+                "Base domain %r provided; generating hosts from base domain.",
+                self.base_domain,
+            )
+            object.__setattr__(self, "api_host", f"api.{self.base_domain}")
+            object.__setattr__(self, "otlp_host", f"otlp.{self.base_domain}")
+            object.__setattr__(
+                self, "flight_host", f"flight.{self.base_domain}"
            )
 
-        # Single host override: if single_host is set, it overrides hosts
         if has_single_host:
            logger.info(
                "Single host %r provided; overriding hosts configuration with single host.",
@@ -356,7 +514,6 @@ class SDKConfiguration:
         object.__setattr__(self, "otlp_host", self.single_host)
         object.__setattr__(self, "flight_host", self.single_host)
 
-        # Single port override: if single_port is set, it overrides ports
         if has_single_port:
             logger.info(
                 "Single port %s provided; overriding ports configuration with single port.",
@@ -364,15 +521,12 @@ class SDKConfiguration:
             )
         object.__setattr__(self, "flight_port", self.single_port)
 
-        # Region override: if region is set, it *always* wins over host/port fields
         if has_region:
-            endpoints = REGION_ENDPOINTS[self.region]
-
-            # Override config (region trumps everything)
             logger.info(
                 "Region %s provided; overriding hosts & ports configuration with region defaults.",
                 self.region.value,
             )
+            endpoints = REGION_ENDPOINTS[self.region]
            object.__setattr__(self, "api_host", endpoints.api_host)
            object.__setattr__(self, "otlp_host", endpoints.otlp_host)
            object.__setattr__(self, "flight_host", endpoints.flight_host)
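
For orientation, a minimal sketch of the new base_domain behavior (the API key and domain values are placeholders; the field and exception names come from the diff above):

    # Sketch only: values are illustrative placeholders.
    from arize.config import SDKConfiguration
    from arize.exceptions.config import MultipleEndpointOverridesError

    cfg = SDKConfiguration(
        api_key="ak-12345678",
        base_domain="pc.example.com",  # e.g. a Private Connect domain
    )
    # __post_init__ derives every endpoint host from the base domain:
    assert cfg.api_host == "api.pc.example.com"
    assert cfg.otlp_host == "otlp.pc.example.com"
    assert cfg.flight_host == "flight.pc.example.com"

    # Mixing override styles now fails fast instead of silently picking a winner:
    try:
        SDKConfiguration(
            api_key="ak-12345678",
            base_domain="pc.example.com",
            single_host="localhost",
        )
    except MultipleEndpointOverridesError:
        pass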
arize/constants/config.py CHANGED
@@ -14,6 +14,7 @@ ENV_FLIGHT_PORT = "ARIZE_FLIGHT_PORT"
 ENV_FLIGHT_SCHEME = "ARIZE_FLIGHT_SCHEME"
 ENV_SINGLE_HOST = "ARIZE_SINGLE_HOST"
 ENV_SINGLE_PORT = "ARIZE_SINGLE_PORT"
+ENV_BASE_DOMAIN = "ARIZE_BASE_DOMAIN"
 ENV_PYARROW_MAX_CHUNKSIZE = "ARIZE_MAX_CHUNKSIZE"
 ENV_REQUEST_VERIFY = "ARIZE_REQUEST_VERIFY"
 ENV_MAX_HTTP_PAYLOAD_SIZE_MB = "ARIZE_MAX_HTTP_PAYLOAD_SIZE_MB"
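
The same override can be driven purely through the environment; a short sketch (the domain value is a placeholder):

    # Sketch only: set the new variable before constructing the configuration.
    import os

    os.environ["ARIZE_BASE_DOMAIN"] = "pc.example.com"

    from arize.config import SDKConfiguration

    cfg = SDKConfiguration(api_key="ak-12345678")  # reads ARIZE_BASE_DOMAIN
    assert cfg.flight_host == "flight.pc.example.com"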
arize/datasets/client.py CHANGED
@@ -5,7 +5,7 @@ from __future__ import annotations
 import logging
 import time
 import uuid
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, cast
 
 import pandas as pd
 import pyarrow as pa
@@ -24,6 +24,10 @@ from arize.utils.openinference_conversion import (
 )
 from arize.utils.size import get_payload_size_mb
 
 if TYPE_CHECKING:
+    # builtins is needed to use builtins.list in type annotations because
+    # the class has a list() method that shadows the built-in list type
+    import builtins
+
     from arize._generated.api_client.api_client import ApiClient
     from arize.config import SDKConfiguration
@@ -97,7 +101,7 @@ class DatasetsClient:
         *,
         name: str,
         space_id: str,
-        examples: list[dict[str, object]] | pd.DataFrame,
+        examples: builtins.list[dict[str, object]] | pd.DataFrame,
         force_http: bool = False,
     ) -> models.Dataset:
         """Create a dataset with JSON examples.
@@ -150,7 +154,7 @@
         from arize._generated import api_client as gen
 
         data = (
-            examples.to_dict(orient="records")  # type: ignore
+            examples.to_dict(orient="records")
             if isinstance(examples, pd.DataFrame)
             else examples
         )
@@ -158,7 +162,8 @@
         body = gen.DatasetsCreateRequest(
             name=name,
             space_id=space_id,
-            examples=data,
+            # Cast: pandas to_dict returns dict[Hashable, Any] but API requires dict[str, Any]
+            examples=cast("list[dict[str, Any]]", data),
         )
         return self._api.datasets_create(datasets_create_request=body)
 
@@ -169,15 +174,12 @@
             "Trying to convert to DataFrame for more efficient upload via "
             "gRPC + Flight."
         )
-        data = (
-            examples
-            if isinstance(examples, pd.DataFrame)
-            else pd.DataFrame(examples)
-        )
+        if not isinstance(examples, pd.DataFrame):
+            examples = pd.DataFrame(examples)
         return self._create_dataset_via_flight(
             name=name,
             space_id=space_id,
-            examples=data,
+            examples=examples,
         )
 
     @prerelease_endpoint(key="datasets.get", stage=ReleaseStage.BETA)
@@ -280,7 +282,11 @@
         )
         if dataset_df is not None:
             return models.DatasetsExamplesList200Response(
-                examples=dataset_df.to_dict(orient="records"),  # type: ignore
+                # Cast: Pydantic validates and converts dicts to DatasetExample at runtime
+                examples=cast(
+                    "list[models.DatasetExample]",
+                    dataset_df.to_dict(orient="records"),
+                ),
                 pagination=models.PaginationMetadata(
                     has_more=False,  # Note that all=True
                 ),
@@ -321,7 +327,11 @@
         )
 
         return models.DatasetsExamplesList200Response(
-            examples=dataset_df.to_dict(orient="records"),  # type: ignore
+            # Cast: Pydantic validates and converts dicts to DatasetExample at runtime
+            examples=cast(
+                "list[models.DatasetExample]",
+                dataset_df.to_dict(orient="records"),
+            ),
            pagination=models.PaginationMetadata(
                has_more=False,  # Note that all=True
            ),
@@ -336,7 +346,7 @@
         *,
         dataset_id: str,
         dataset_version_id: str = "",
-        examples: list[dict[str, object]] | pd.DataFrame,
+        examples: builtins.list[dict[str, object]] | pd.DataFrame,
     ) -> models.Dataset:
         """Append new examples to an existing dataset.
 
@@ -377,11 +387,14 @@
         )
 
         data = (
-            examples.to_dict(orient="records")  # type: ignore
+            examples.to_dict(orient="records")
             if isinstance(examples, pd.DataFrame)
             else examples
         )
-        body = gen.DatasetsExamplesInsertRequest(examples=data)
+        # Cast: pandas to_dict returns dict[Hashable, Any] but API requires dict[str, Any]
+        body = gen.DatasetsExamplesInsertRequest(
+            examples=cast("list[dict[str, Any]]", data)
+        )
 
         return self._api.datasets_examples_insert(
             dataset_id=dataset_id,
@@ -394,7 +407,7 @@
         name: str,
         space_id: str,
         examples: pd.DataFrame,
-    ) -> object:
+    ) -> models.Dataset:
         """Internal method to create a dataset using Flight protocol for large example sets."""
         data = examples.copy()
         # Convert datetime columns to int64 (ms since epoch)
@@ -454,19 +467,19 @@ def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
     """Set default values for created_at and updated_at columns if missing or null."""
     current_time = int(time.time() * 1000)
     if "created_at" in df.columns:
-        if df["created_at"].isnull().values.any():  # type: ignore
+        if df["created_at"].isnull().any():
             df["created_at"].fillna(current_time, inplace=True)
         else:
             df["created_at"] = current_time
 
     if "updated_at" in df.columns:
-        if df["updated_at"].isnull().values.any():  # type: ignore
+        if df["updated_at"].isnull().any():
             df["updated_at"].fillna(current_time, inplace=True)
         else:
             df["updated_at"] = current_time
 
     if "id" in df.columns:
-        if df["id"].isnull().values.any():  # type: ignore
+        if df["id"].isnull().any():
             df["id"] = df["id"].apply(
                 lambda x: str(uuid.uuid4()) if pd.isnull(x) else x
             )
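
One detail worth calling out from the diff above is the builtins import: DatasetsClient defines a list() method, so inside the class body the bare name list refers to that method rather than the built-in type. A minimal reproduction of the problem, with invented names:

    # Minimal reproduction of the shadowing problem; class and methods are invented.
    from __future__ import annotations

    import builtins


    class Client:
        def list(self) -> None:
            """This method shadows the built-in `list` inside the class body."""

        # Annotating with bare `list[...]` here would resolve to the method above
        # under static analysis; `builtins.list[...]` is unambiguous.
        def create(self, examples: builtins.list[dict[str, object]]) -> None:
            for example in examples:
                print(example)


    Client().create([{"input": "hello"}])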
arize/embeddings/auto_generator.py CHANGED
@@ -1,5 +1,7 @@
 """Automatic embedding generation factory for various ML use cases."""
 
+from typing import TypeAlias
+
 import pandas as pd
 
 from arize.embeddings import constants
@@ -24,9 +26,14 @@ from arize.embeddings.nlp_generators import (
 from arize.embeddings.tabular_generators import (
     EmbeddingGeneratorForTabularFeatures,
 )
-from arize.embeddings.usecases import UseCases
+from arize.embeddings.usecases import (
+    CVUseCases,
+    NLPUseCases,
+    TabularUseCases,
+    UseCases,
+)
 
-UseCaseLike = str | UseCases.NLP | UseCases.CV | UseCases.STRUCTURED
+UseCaseLike: TypeAlias = str | NLPUseCases | CVUseCases | TabularUseCases
 
 
 class EmbeddingGenerator:
@@ -49,15 +56,15 @@ class EmbeddingGenerator:
     ) -> BaseEmbeddingGenerator:
         """Create an embedding generator for the specified use case."""
         if use_case == UseCases.NLP.SEQUENCE_CLASSIFICATION:
-            return EmbeddingGeneratorForNLPSequenceClassification(**kwargs)
+            return EmbeddingGeneratorForNLPSequenceClassification(**kwargs)  # type: ignore[arg-type]
         if use_case == UseCases.NLP.SUMMARIZATION:
-            return EmbeddingGeneratorForNLPSummarization(**kwargs)
+            return EmbeddingGeneratorForNLPSummarization(**kwargs)  # type: ignore[arg-type]
         if use_case == UseCases.CV.IMAGE_CLASSIFICATION:
-            return EmbeddingGeneratorForCVImageClassification(**kwargs)
+            return EmbeddingGeneratorForCVImageClassification(**kwargs)  # type: ignore[arg-type]
         if use_case == UseCases.CV.OBJECT_DETECTION:
-            return EmbeddingGeneratorForCVObjectDetection(**kwargs)
+            return EmbeddingGeneratorForCVObjectDetection(**kwargs)  # type: ignore[arg-type]
         if use_case == UseCases.STRUCTURED.TABULAR_EMBEDDINGS:
-            return EmbeddingGeneratorForTabularFeatures(**kwargs)
+            return EmbeddingGeneratorForTabularFeatures(**kwargs)  # type: ignore[arg-type]
         raise ValueError(f"Invalid use case {use_case}")
 
     @classmethod
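
A brief aside on the UseCaseLike change, assuming (as the diff suggests) that UseCases.NLP and friends alias the enum classes now imported directly: type checkers want resolvable class names in a union, which attribute access does not reliably give them. A minimal illustration with invented members:

    # Illustration only: the enum members are invented, not the SDK's real ones.
    from enum import Enum
    from typing import TypeAlias


    class NLPUseCases(Enum):
        SEQUENCE_CLASSIFICATION = "sequence_classification"


    class CVUseCases(Enum):
        IMAGE_CLASSIFICATION = "image_classification"


    # Direct class references form a union both runtime and type checkers accept.
    UseCaseLike: TypeAlias = str | NLPUseCases | CVUseCases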
arize/embeddings/base_generators.py CHANGED
@@ -14,11 +14,15 @@ try:
     import torch
     from datasets import Dataset
     from PIL import Image
-    from transformers import (  # type: ignore
+    from transformers import (
         AutoImageProcessor,
         AutoModel,
         AutoTokenizer,
+        BaseImageProcessor,
         BatchEncoding,
+        BatchFeature,
+        PreTrainedModel,
+        PreTrainedTokenizerBase,
     )
     from transformers.utils import logging as transformer_logging
 except ImportError as e:
@@ -67,7 +71,9 @@ class BaseEmbeddingGenerator(ABC):
             raise
 
     @abstractmethod
-    def generate_embeddings(self, **kwargs: object) -> pd.Series:
+    def generate_embeddings(
+        self, **kwargs: object
+    ) -> pd.Series | tuple[pd.Series, pd.Series]:
         """Generate embeddings for the input data."""
         ...
 
@@ -95,7 +101,7 @@
         return self.__model_name
 
     @property
-    def model(self) -> object:
+    def model(self) -> PreTrainedModel:
         """Return the underlying model instance."""
         return self.__model
 
@@ -183,7 +189,7 @@ class NLPEmbeddingGenerator(BaseEmbeddingGenerator):
             tokenizer_max_length: Maximum sequence length for the tokenizer.
             **kwargs: Additional arguments for model initialization.
         """
-        super().__init__(use_case=use_case, model_name=model_name, **kwargs)
+        super().__init__(use_case=use_case, model_name=model_name, **kwargs)  # type: ignore[arg-type]
         self.__tokenizer_max_length = tokenizer_max_length
         # We don't check for the tokenizer's existence since it is coupled with the corresponding model
         # We check the model's existence in `BaseEmbeddingGenerator`
@@ -193,7 +199,7 @@
         )
 
     @property
-    def tokenizer(self) -> object:
+    def tokenizer(self) -> PreTrainedTokenizerBase:
         """Return the tokenizer instance for text processing."""
         return self.__tokenizer
 
@@ -240,7 +246,7 @@ class CVEmbeddingGenerator(BaseEmbeddingGenerator):
             model_name: Name of the pre-trained vision model.
             **kwargs: Additional arguments for model initialization.
         """
-        super().__init__(use_case=use_case, model_name=model_name, **kwargs)
+        super().__init__(use_case=use_case, model_name=model_name, **kwargs)  # type: ignore[arg-type]
         logger.info("Downloading image processor")
         # We don't check for the image processor's existence since it is coupled with the corresponding model
         # We check the model's existence in `BaseEmbeddingGenerator`
@@ -249,7 +255,7 @@
         )
 
     @property
-    def image_processor(self) -> object:
+    def image_processor(self) -> BaseImageProcessor:
         """Return the image processor instance for image preprocessing."""
         return self.__image_processor
 
@@ -262,7 +268,7 @@
 
     def preprocess_image(
         self, batch: dict[str, list[str]], local_image_feat_name: str
-    ) -> object:
+    ) -> BatchFeature:
         """Preprocess a batch of images for model input."""
         return self.image_processor(
             [
@@ -272,7 +278,7 @@
             return_tensors="pt",
         ).to(self.device)
 
-    def generate_embeddings(self, local_image_path_col: pd.Series) -> pd.Series:
+    def generate_embeddings(self, local_image_path_col: pd.Series) -> pd.Series:  # type: ignore[override]
         """Obtain embedding vectors from your image data using pre-trained image models.
 
         :param local_image_path_col: a pandas Series containing the local path to the images to
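
A quick note on the `# type: ignore[override]` markers above: the abstract method accepts **kwargs, while the subclasses declare concrete parameters, which static checkers flag as an incompatible (non-Liskov) override even though the behavior is intentional. A minimal reproduction, with invented names:

    # Minimal reproduction of the override pattern; names are invented.
    from abc import ABC, abstractmethod


    class Base(ABC):
        @abstractmethod
        def generate(self, **kwargs: object) -> str:
            """Accepts arbitrary keyword arguments."""


    class Concrete(Base):
        # mypy: error: Signature of "generate" incompatible with supertype "Base"
        def generate(self, text: str) -> str:  # type: ignore[override]
            return text.upper()


    print(Concrete().generate(text="hello"))  # HELLO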
arize/embeddings/cv_generators.py CHANGED
@@ -25,7 +25,7 @@ class EmbeddingGeneratorForCVImageClassification(CVEmbeddingGenerator):
         super().__init__(
             use_case=UseCases.CV.IMAGE_CLASSIFICATION,
             model_name=model_name,
-            **kwargs,
+            **kwargs,  # type: ignore[arg-type]
         )
 
 
@@ -46,5 +46,5 @@ class EmbeddingGeneratorForCVObjectDetection(CVEmbeddingGenerator):
         super().__init__(
             use_case=UseCases.CV.OBJECT_DETECTION,
             model_name=model_name,
-            **kwargs,
+            **kwargs,  # type: ignore[arg-type]
         )
@@ -39,10 +39,10 @@ class EmbeddingGeneratorForNLPSequenceClassification(NLPEmbeddingGenerator):
39
39
  super().__init__(
40
40
  use_case=UseCases.NLP.SEQUENCE_CLASSIFICATION,
41
41
  model_name=model_name,
42
- **kwargs,
42
+ **kwargs, # type: ignore[arg-type]
43
43
  )
44
44
 
45
- def generate_embeddings(
45
+ def generate_embeddings( # type: ignore[override]
46
46
  self,
47
47
  text_col: pd.Series,
48
48
  class_label_col: pd.Series | None = None,
@@ -65,10 +65,10 @@ class EmbeddingGeneratorForNLPSequenceClassification(NLPEmbeddingGenerator):
65
65
  if class_label_col is not None:
66
66
  if not isinstance(class_label_col, pd.Series):
67
67
  raise TypeError("class_label_col must be a pandas Series")
68
- df = pd.concat(
68
+ temp_df = pd.concat(
69
69
  {"text": text_col, "class_label": class_label_col}, axis=1
70
70
  )
71
- prepared_text_col = df.apply(
71
+ prepared_text_col = temp_df.apply(
72
72
  lambda row: f" The classification label is {row['class_label']}. {row['text']}",
73
73
  axis=1,
74
74
  )
@@ -83,8 +83,8 @@ class EmbeddingGeneratorForNLPSequenceClassification(NLPEmbeddingGenerator):
83
83
  batched=True,
84
84
  batch_size=self.batch_size,
85
85
  )
86
- df: pd.DataFrame = ds.to_pandas()
87
- return df["embedding_vector"]
86
+ result_df: pd.DataFrame = ds.to_pandas()
87
+ return result_df["embedding_vector"]
88
88
 
89
89
 
90
90
  class EmbeddingGeneratorForNLPSummarization(NLPEmbeddingGenerator):
@@ -104,10 +104,10 @@ class EmbeddingGeneratorForNLPSummarization(NLPEmbeddingGenerator):
104
104
  super().__init__(
105
105
  use_case=UseCases.NLP.SUMMARIZATION,
106
106
  model_name=model_name,
107
- **kwargs,
107
+ **kwargs, # type: ignore[arg-type]
108
108
  )
109
109
 
110
- def generate_embeddings(
110
+ def generate_embeddings( # type: ignore[override]
111
111
  self,
112
112
  text_col: pd.Series,
113
113
  ) -> pd.Series: