arize 8.0.0a10__py3-none-any.whl → 8.0.0a11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arize/client.py CHANGED
@@ -12,11 +12,13 @@ if TYPE_CHECKING:
12
12
  from arize.spans.client import SpansClient
13
13
 
14
14
 
15
+ # TODO(Kiko): experimental/datasets must be adapted into the datasets subclient
16
+ # TODO(Kiko): experimental/prompt hub is missing
17
+ # TODO(Kiko): exporter/utils/schema_parser is missing
15
18
  # TODO(Kiko): Go through main APIs and add CtxAdapter where missing
16
19
  # TODO(Kiko): Search and handle other TODOs
17
20
  # TODO(Kiko): Go over **every file** and do not import anything at runtime, use `if TYPE_CHECKING`
18
21
  # with `from __future__ import annotations` (must include for Python < 3.11)
19
- # TODO(Kiko): MIMIC Explainer not done
20
22
  # TODO(Kiko): Go over docstrings
21
23
  class ArizeClient(LazySubclientsMixin):
22
24
  """
arize/embeddings/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from arize.embeddings.auto_generator import EmbeddingGenerator
2
+ from arize.embeddings.usecases import UseCases
3
+
4
+ __all__ = ["EmbeddingGenerator", "UseCases"]
arize/embeddings/auto_generator.py ADDED
@@ -0,0 +1,108 @@
1
+ from typing import Any
2
+
3
+ import pandas as pd
4
+
5
+ from arize.embeddings import constants
6
+ from arize.embeddings.base_generators import BaseEmbeddingGenerator
7
+ from arize.embeddings.constants import (
8
+ CV_PRETRAINED_MODELS,
9
+ DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL,
10
+ DEFAULT_CV_OBJECT_DETECTION_MODEL,
11
+ DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL,
12
+ DEFAULT_NLP_SUMMARIZATION_MODEL,
13
+ DEFAULT_TABULAR_MODEL,
14
+ NLP_PRETRAINED_MODELS,
15
+ )
16
+ from arize.embeddings.cv_generators import (
17
+ EmbeddingGeneratorForCVImageClassification,
18
+ EmbeddingGeneratorForCVObjectDetection,
19
+ )
20
+ from arize.embeddings.nlp_generators import (
21
+ EmbeddingGeneratorForNLPSequenceClassification,
22
+ EmbeddingGeneratorForNLPSummarization,
23
+ )
24
+ from arize.embeddings.tabular_generators import (
25
+ EmbeddingGeneratorForTabularFeatures,
26
+ )
27
+ from arize.embeddings.usecases import UseCases
28
+
29
+ UseCaseLike = str | UseCases.NLP | UseCases.CV | UseCases.STRUCTURED
30
+
31
+
32
+ class EmbeddingGenerator:
33
+ def __init__(self, **kwargs: str):
34
+ raise OSError(
35
+ f"{self.__class__.__name__} is designed to be instantiated using the "
36
+ f"`{self.__class__.__name__}.from_use_case(use_case, **kwargs)` method."
37
+ )
38
+
39
+ @staticmethod
40
+ def from_use_case(
41
+ use_case: UseCaseLike, **kwargs: Any
42
+ ) -> BaseEmbeddingGenerator:
43
+ if use_case == UseCases.NLP.SEQUENCE_CLASSIFICATION:
44
+ return EmbeddingGeneratorForNLPSequenceClassification(**kwargs)
45
+ elif use_case == UseCases.NLP.SUMMARIZATION:
46
+ return EmbeddingGeneratorForNLPSummarization(**kwargs)
47
+ elif use_case == UseCases.CV.IMAGE_CLASSIFICATION:
48
+ return EmbeddingGeneratorForCVImageClassification(**kwargs)
49
+ elif use_case == UseCases.CV.OBJECT_DETECTION:
50
+ return EmbeddingGeneratorForCVObjectDetection(**kwargs)
51
+ elif use_case == UseCases.STRUCTURED.TABULAR_EMBEDDINGS:
52
+ return EmbeddingGeneratorForTabularFeatures(**kwargs)
53
+ else:
54
+ raise ValueError(f"Invalid use case {use_case}")
55
+
56
+ @classmethod
57
+ def list_default_models(cls) -> pd.DataFrame:
58
+ df = pd.DataFrame(
59
+ {
60
+ "Area": ["NLP", "NLP", "CV", "CV", "STRUCTURED"],
61
+ "Usecase": [
62
+ UseCases.NLP.SEQUENCE_CLASSIFICATION.name,
63
+ UseCases.NLP.SUMMARIZATION.name,
64
+ UseCases.CV.IMAGE_CLASSIFICATION.name,
65
+ UseCases.CV.OBJECT_DETECTION.name,
66
+ UseCases.STRUCTURED.TABULAR_EMBEDDINGS.name,
67
+ ],
68
+ "Model Name": [
69
+ DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL,
70
+ DEFAULT_NLP_SUMMARIZATION_MODEL,
71
+ DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL,
72
+ DEFAULT_CV_OBJECT_DETECTION_MODEL,
73
+ DEFAULT_TABULAR_MODEL,
74
+ ],
75
+ }
76
+ )
77
+ df.sort_values(
78
+ by=[col for col in df.columns], ascending=True, inplace=True
79
+ )
80
+ return df.reset_index(drop=True)
81
+
82
+ @classmethod
83
+ def list_pretrained_models(cls) -> pd.DataFrame:
84
+ data = {
85
+ "Task": ["NLP" for _ in NLP_PRETRAINED_MODELS]
86
+ + ["CV" for _ in CV_PRETRAINED_MODELS],
87
+ "Architecture": [
88
+ cls.__parse_model_arch(model)
89
+ for model in NLP_PRETRAINED_MODELS + CV_PRETRAINED_MODELS
90
+ ],
91
+ "Model Name": NLP_PRETRAINED_MODELS + CV_PRETRAINED_MODELS,
92
+ }
93
+ df = pd.DataFrame(data)
94
+ df.sort_values(
95
+ by=[col for col in df.columns], ascending=True, inplace=True
96
+ )
97
+ return df.reset_index(drop=True)
98
+
99
+ @staticmethod
100
+ def __parse_model_arch(model_name: str) -> str:
101
+ if constants.GPT.lower() in model_name.lower():
102
+ return constants.GPT
103
+ elif constants.BERT.lower() in model_name.lower():
104
+ return constants.BERT
105
+ elif constants.VIT.lower() in model_name.lower():
106
+ return constants.VIT
107
+ else:
108
+ raise ValueError("Invalid model_name, unknown architecture.")
arize/embeddings/base_generators.py ADDED
@@ -0,0 +1,255 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from enum import Enum
4
+ from functools import partial
5
+ from typing import Dict, List, Union, cast
6
+
7
+ import pandas as pd
8
+
9
+ import arize.embeddings.errors as err
10
+ from arize.embeddings.constants import IMPORT_ERROR_MESSAGE
11
+
12
+ try:
13
+ import torch
14
+ from datasets import Dataset
15
+ from PIL import Image
16
+ from transformers import ( # type: ignore
17
+ AutoImageProcessor,
18
+ AutoModel,
19
+ AutoTokenizer,
20
+ BatchEncoding,
21
+ )
22
+ from transformers.utils import logging as transformer_logging
23
+ except ImportError as e:
24
+ raise ImportError(IMPORT_ERROR_MESSAGE) from e
25
+
26
+ import logging
27
+
28
+ logger = logging.getLogger(__name__)
29
+ transformer_logging.set_verbosity(50)
30
+ transformer_logging.enable_progress_bar()
31
+
32
+
33
+ class BaseEmbeddingGenerator(ABC):
34
+ def __init__(
35
+ self, use_case: Enum, model_name: str, batch_size: int = 100, **kwargs
36
+ ):
37
+ self.__use_case = self._parse_use_case(use_case=use_case)
38
+ self.__model_name = model_name
39
+ self.__device = self.select_device()
40
+ self.__batch_size = batch_size
41
+ logger.info(f"Downloading pre-trained model '{self.model_name}'")
42
+ try:
43
+ self.__model = AutoModel.from_pretrained(
44
+ self.model_name, **kwargs
45
+ ).to(self.device)
46
+ except OSError as e:
47
+ raise err.HuggingFaceRepositoryNotFound(model_name) from e
48
+ except Exception as e:
49
+ raise e
50
+
51
+ @abstractmethod
52
+ def generate_embeddings(self, **kwargs) -> pd.Series: ...
53
+
54
+ def select_device(self) -> torch.device:
55
+ if torch.cuda.is_available():
56
+ return torch.device("cuda")
57
+ elif torch.backends.mps.is_available():
58
+ return torch.device("mps")
59
+ else:
60
+ logger.warning(
61
+ "No available GPU has been detected. The use of GPU acceleration is "
62
+ "strongly recommended. You can check for GPU availability by running "
63
+ "`torch.cuda.is_available()` or `torch.backends.mps.is_available()`."
64
+ )
65
+ return torch.device("cpu")
66
+
67
+ @property
68
+ def use_case(self) -> str:
69
+ return self.__use_case
70
+
71
+ @property
72
+ def model_name(self) -> str:
73
+ return self.__model_name
74
+
75
+ @property
76
+ def model(self):
77
+ return self.__model
78
+
79
+ @property
80
+ def device(self) -> torch.device:
81
+ return self.__device
82
+
83
+ @property
84
+ def batch_size(self) -> int:
85
+ return self.__batch_size
86
+
87
+ @batch_size.setter
88
+ def batch_size(self, new_batch_size: int) -> None:
89
+ err_message = "New batch size should be an integer greater than 0."
90
+ if not isinstance(new_batch_size, int):
91
+ raise TypeError(err_message)
92
+ elif new_batch_size <= 0:
93
+ raise ValueError(err_message)
94
+ else:
95
+ self.__batch_size = new_batch_size
96
+ logger.info(f"Batch size has been set to {new_batch_size}.")
97
+
98
+ @staticmethod
99
+ def _parse_use_case(use_case: Enum) -> str:
100
+ uc_area = use_case.__class__.__name__.split("UseCases")[0]
101
+ uc_task = use_case.name
102
+ return f"{uc_area}.{uc_task}"
103
+
104
+ def _get_embedding_vector(
105
+ self, batch: Dict[str, torch.Tensor], method
106
+ ) -> Dict[str, torch.Tensor]:
107
+ with torch.no_grad():
108
+ outputs = self.model(**batch)
109
+ # (batch_size, seq_length/or/num_tokens, hidden_size)
110
+ if method == "cls_token": # Select CLS token vector
111
+ embeddings = outputs.last_hidden_state[:, 0, :]
112
+ elif method == "avg_token": # Select avg token vector
113
+ embeddings = torch.mean(outputs.last_hidden_state, 1)
114
+ else:
115
+ raise ValueError(f"Invalid method = {method}")
116
+ return {"embedding_vector": embeddings.cpu().numpy().astype(float)}
117
+
118
+ @staticmethod
119
+ def check_invalid_index(field: Union[pd.Series, pd.DataFrame]) -> None:
120
+ if (field.index != field.reset_index(drop=True).index).any():
121
+ if isinstance(field, pd.DataFrame):
122
+ raise err.InvalidIndexError("DataFrame")
123
+ else:
124
+ raise err.InvalidIndexError(str(field.name))
125
+
126
+ @abstractmethod
127
+ def __repr__(self) -> str:
128
+ pass
129
+
130
+
131
+ class NLPEmbeddingGenerator(BaseEmbeddingGenerator):
132
+ def __repr__(self) -> str:
133
+ return (
134
+ f"{self.__class__.__name__}(\n"
135
+ f" use_case={self.use_case},\n"
136
+ f" model_name='{self.model_name}',\n"
137
+ f" tokenizer_max_length={self.tokenizer_max_length},\n"
138
+ f" tokenizer={self.tokenizer.__class__},\n"
139
+ f" model={self.model.__class__},\n"
140
+ f" batch_size={self.batch_size},\n"
141
+ f")"
142
+ )
143
+
144
+ def __init__(
145
+ self,
146
+ use_case: Enum,
147
+ model_name: str,
148
+ tokenizer_max_length: int = 512,
149
+ **kwargs,
150
+ ):
151
+ super().__init__(use_case=use_case, model_name=model_name, **kwargs)
152
+ self.__tokenizer_max_length = tokenizer_max_length
153
+ # We don't check for the tokenizer's existence since it is coupled with the corresponding model
154
+ # We check the model's existence in `BaseEmbeddingGenerator`
155
+ logger.info(f"Downloading tokenizer for '{self.model_name}'")
156
+ self.__tokenizer = AutoTokenizer.from_pretrained(
157
+ self.model_name, model_max_length=self.tokenizer_max_length
158
+ )
159
+
160
+ @property
161
+ def tokenizer(self):
162
+ return self.__tokenizer
163
+
164
+ @property
165
+ def tokenizer_max_length(self) -> int:
166
+ return self.__tokenizer_max_length
167
+
168
+ def tokenize(
169
+ self, batch: Dict[str, List[str]], text_feat_name: str
170
+ ) -> BatchEncoding:
171
+ return self.tokenizer(
172
+ batch[text_feat_name],
173
+ padding=True,
174
+ truncation=True,
175
+ max_length=self.tokenizer_max_length,
176
+ return_tensors="pt",
177
+ ).to(self.device)
178
+
179
+
180
+ class CVEmbeddingGenerator(BaseEmbeddingGenerator):
181
+ def __repr__(self) -> str:
182
+ return (
183
+ f"{self.__class__.__name__}(\n"
184
+ f" use_case={self.use_case},\n"
185
+ f" model_name='{self.model_name}',\n"
186
+ f" image_processor={self.image_processor.__class__},\n"
187
+ f" model={self.model.__class__},\n"
188
+ f" batch_size={self.batch_size},\n"
189
+ f")"
190
+ )
191
+
192
+ def __init__(self, use_case: Enum, model_name: str, **kwargs):
193
+ super().__init__(use_case=use_case, model_name=model_name, **kwargs)
194
+ logger.info("Downloading image processor")
195
+ # We don't check for the image processor's existence since it is coupled with the corresponding model
196
+ # We check the model's existence in `BaseEmbeddingGenerator`
197
+ self.__image_processor = AutoImageProcessor.from_pretrained(
198
+ self.model_name
199
+ )
200
+
201
+ @property
202
+ def image_processor(self):
203
+ return self.__image_processor
204
+
205
+ @staticmethod
206
+ def open_image(image_path: str) -> Image.Image:
207
+ if not os.path.exists(image_path):
208
+ raise ValueError(f"Cannot find image {image_path}")
209
+ return Image.open(image_path).convert("RGB")
210
+
211
+ def preprocess_image(
212
+ self, batch: Dict[str, List[str]], local_image_feat_name: str
213
+ ):
214
+ return self.image_processor(
215
+ [
216
+ self.open_image(image_path)
217
+ for image_path in batch[local_image_feat_name]
218
+ ],
219
+ return_tensors="pt",
220
+ ).to(self.device)
221
+
222
+ def generate_embeddings(self, local_image_path_col: pd.Series) -> pd.Series:
223
+ """
224
+ Obtain embedding vectors from your image data using pre-trained image models.
225
+
226
+ :param local_image_path_col: a pandas Series containing the local path to the images to
227
+ be used to generate the embedding vectors.
228
+ :return: a pandas Series containing the embedding vectors.
229
+ """
230
+ if not isinstance(local_image_path_col, pd.Series):
231
+ raise TypeError(
232
+ "local_image_path_col_name must be pandas Series object"
233
+ )
234
+ self.check_invalid_index(field=local_image_path_col)
235
+
236
+ # Validate that there are no null image paths
237
+ if local_image_path_col.isnull().any():
238
+ raise ValueError(
239
+ "There can't be any null values in the local_image_path_col series"
240
+ )
241
+
242
+ ds = Dataset.from_dict({"local_path": local_image_path_col})
243
+ ds.set_transform(
244
+ partial(
245
+ self.preprocess_image,
246
+ local_image_feat_name="local_path",
247
+ )
248
+ )
249
+ logger.info("Generating embedding vectors")
250
+ ds = ds.map(
251
+ lambda batch: self._get_embedding_vector(batch, "avg_token"),
252
+ batched=True,
253
+ batch_size=self.batch_size,
254
+ )
255
+ return cast(pd.DataFrame, ds.to_pandas())["embedding_vector"]
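A short sketch of the base-class helpers above (`select_device` and the validated `batch_size` setter), using the NLP sequence-classification generator as a concrete subclass and assuming the `auto-embeddings` extra is installed:

```python
from arize.embeddings.nlp_generators import (
    EmbeddingGeneratorForNLPSequenceClassification,
)

gen = EmbeddingGeneratorForNLPSequenceClassification()
print(gen.device)      # torch.device picked by select_device(): cuda, mps, or cpu
print(gen.batch_size)  # 100 by default

gen.batch_size = 500   # accepted: a positive integer
try:
    gen.batch_size = 0  # rejected by the setter
except ValueError as exc:
    print(exc)
```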
arize/embeddings/constants.py ADDED
@@ -0,0 +1,34 @@
1
+ DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL = "distilbert-base-uncased"
2
+ DEFAULT_NLP_SUMMARIZATION_MODEL = "distilbert-base-uncased"
3
+ DEFAULT_TABULAR_MODEL = "distilbert-base-uncased"
4
+ DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL = "google/vit-base-patch32-224-in21k"
5
+ DEFAULT_CV_OBJECT_DETECTION_MODEL = "facebook/detr-resnet-101"
6
+ NLP_PRETRAINED_MODELS = [
7
+ "bert-base-cased",
8
+ "bert-base-uncased",
9
+ "bert-large-cased",
10
+ "bert-large-uncased",
11
+ "distilbert-base-cased",
12
+ "distilbert-base-uncased",
13
+ "xlm-roberta-base",
14
+ "xlm-roberta-large",
15
+ ]
16
+
17
+ CV_PRETRAINED_MODELS = [
18
+ "google/vit-base-patch16-224-in21k",
19
+ "google/vit-base-patch16-384",
20
+ "google/vit-base-patch32-224-in21k",
21
+ "google/vit-base-patch32-384",
22
+ "google/vit-large-patch16-224-in21k",
23
+ "google/vit-large-patch16-384",
24
+ "google/vit-large-patch32-224-in21k",
25
+ "google/vit-large-patch32-384",
26
+ ]
27
+ IMPORT_ERROR_MESSAGE = (
28
+ "To enable embedding generation, the arize module must be installed with "
29
+ "extra dependencies. Run: pip install 'arize[auto-embeddings]'."
30
+ )
31
+
32
+ GPT = "GPT"
33
+ BERT = "BERT"
34
+ VIT = "ViT"
arize/embeddings/cv_generators.py ADDED
@@ -0,0 +1,28 @@
1
+ from arize.embeddings.base_generators import CVEmbeddingGenerator
2
+ from arize.embeddings.constants import (
3
+ DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL,
4
+ DEFAULT_CV_OBJECT_DETECTION_MODEL,
5
+ )
6
+ from arize.embeddings.usecases import UseCases
7
+
8
+
9
+ class EmbeddingGeneratorForCVImageClassification(CVEmbeddingGenerator):
10
+ def __init__(
11
+ self, model_name: str = DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL, **kwargs
12
+ ):
13
+ super().__init__(
14
+ use_case=UseCases.CV.IMAGE_CLASSIFICATION,
15
+ model_name=model_name,
16
+ **kwargs,
17
+ )
18
+
19
+
20
+ class EmbeddingGeneratorForCVObjectDetection(CVEmbeddingGenerator):
21
+ def __init__(
22
+ self, model_name: str = DEFAULT_CV_OBJECT_DETECTION_MODEL, **kwargs
23
+ ):
24
+ super().__init__(
25
+ use_case=UseCases.CV.OBJECT_DETECTION,
26
+ model_name=model_name,
27
+ **kwargs,
28
+ )
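An illustrative sketch of the image-classification generator above; the image paths are placeholders and the `auto-embeddings` extra is assumed:

```python
import pandas as pd

from arize.embeddings import EmbeddingGenerator, UseCases

generator = EmbeddingGenerator.from_use_case(
    use_case=UseCases.CV.IMAGE_CLASSIFICATION,
    batch_size=32,
)

# generate_embeddings expects a Series of local image paths with a default index.
df = pd.DataFrame({"image_path": ["/data/images/0.png", "/data/images/1.png"]})
df["image_vector"] = generator.generate_embeddings(
    local_image_path_col=df["image_path"]
)
```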
arize/embeddings/errors.py ADDED
@@ -0,0 +1,41 @@
1
+ class InvalidIndexError(Exception):
2
+ def __repr__(self) -> str:
3
+ return "Invalid_Index_Error"
4
+
5
+ def __str__(self) -> str:
6
+ return self.error_message()
7
+
8
+ def __init__(self, field_name: str) -> None:
9
+ self.field_name = field_name
10
+
11
+ def error_message(self) -> str:
12
+ if self.field_name == "DataFrame":
13
+ return (
14
+ f"The index of the {self.field_name} is invalid; "
15
+ f"reset the index by using df.reset_index(drop=True, inplace=True)"
16
+ )
17
+ else:
18
+ return (
19
+ f"The index of the Series given by the column '{self.field_name}' is invalid; "
20
+ f"reset the index by using df.reset_index(drop=True, inplace=True)"
21
+ )
22
+
23
+
24
+ class HuggingFaceRepositoryNotFound(Exception):
25
+ def __repr__(self) -> str:
26
+ return "HuggingFace_Repository_Not_Found_Error"
27
+
28
+ def __str__(self) -> str:
29
+ return self.error_message()
30
+
31
+ def __init__(self, model_name: str) -> None:
32
+ self.model_name = model_name
33
+
34
+ def error_message(self) -> str:
35
+ return (
36
+ f"The given model name '{self.model_name}' is not a valid model identifier listed on "
37
+ "'https://huggingface.co/models'. "
38
+ "If this is a private repository, log in with `huggingface-cli login` or importing "
39
+ "`login` from `huggingface_hub` if you are using a notebook. "
40
+ "Learn more in https://huggingface.co/docs/huggingface_hub/quick-start#login"
41
+ )
arize/embeddings/nlp_generators.py ADDED
@@ -0,0 +1,111 @@
1
+ import logging
2
+ from functools import partial
3
+ from typing import Optional, cast
4
+
5
+ import pandas as pd
6
+
7
+ from arize.embeddings.base_generators import NLPEmbeddingGenerator
8
+ from arize.embeddings.constants import (
9
+ DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL,
10
+ DEFAULT_NLP_SUMMARIZATION_MODEL,
11
+ IMPORT_ERROR_MESSAGE,
12
+ )
13
+ from arize.embeddings.usecases import UseCases
14
+
15
+ try:
16
+ from datasets import Dataset
17
+ except ImportError:
18
+ raise ImportError(IMPORT_ERROR_MESSAGE) from None
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class EmbeddingGeneratorForNLPSequenceClassification(NLPEmbeddingGenerator):
25
+ def __init__(
26
+ self,
27
+ model_name: str = DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL,
28
+ **kwargs,
29
+ ):
30
+ super().__init__(
31
+ use_case=UseCases.NLP.SEQUENCE_CLASSIFICATION,
32
+ model_name=model_name,
33
+ **kwargs,
34
+ )
35
+
36
+ def generate_embeddings(
37
+ self,
38
+ text_col: pd.Series,
39
+ class_label_col: Optional[pd.Series] = None,
40
+ ) -> pd.Series:
41
+ """
42
+ Obtain embedding vectors from your text data using pre-trained large language models.
43
+
44
+ :param text_col: a pandas Series containing the different pieces of text.
45
+ :param class_label_col: if this column is passed, the sentence "The classification label
46
+ is <class_label>" will be appended to the text in the `text_col`.
47
+ :return: a pandas Series containing the embedding vectors.
48
+ """
49
+ if not isinstance(text_col, pd.Series):
50
+ raise TypeError("text_col must be a pandas Series")
51
+
52
+ self.check_invalid_index(field=text_col)
53
+
54
+ if class_label_col is not None:
55
+ if not isinstance(class_label_col, pd.Series):
56
+ raise TypeError("class_label_col must be a pandas Series")
57
+ df = pd.concat(
58
+ {"text": text_col, "class_label": class_label_col}, axis=1
59
+ )
60
+ prepared_text_col = df.apply(
61
+ lambda row: f" The classification label is {row['class_label']}. {row['text']}",
62
+ axis=1,
63
+ )
64
+ ds = Dataset.from_dict({"text": prepared_text_col})
65
+ else:
66
+ ds = Dataset.from_dict({"text": text_col})
67
+
68
+ ds.set_transform(partial(self.tokenize, text_feat_name="text"))
69
+ logger.info("Generating embedding vectors")
70
+ ds = ds.map(
71
+ lambda batch: self._get_embedding_vector(batch, "cls_token"),
72
+ batched=True,
73
+ batch_size=self.batch_size,
74
+ )
75
+ return cast(pd.DataFrame, ds.to_pandas())["embedding_vector"]
76
+
77
+
78
+ class EmbeddingGeneratorForNLPSummarization(NLPEmbeddingGenerator):
79
+ def __init__(
80
+ self, model_name: str = DEFAULT_NLP_SUMMARIZATION_MODEL, **kwargs
81
+ ):
82
+ super().__init__(
83
+ use_case=UseCases.NLP.SUMMARIZATION,
84
+ model_name=model_name,
85
+ **kwargs,
86
+ )
87
+
88
+ def generate_embeddings(
89
+ self,
90
+ text_col: pd.Series,
91
+ ) -> pd.Series:
92
+ """
93
+ Obtain embedding vectors from your text data using pre-trained large language models.
94
+
95
+ :param text_col: a pandas Series containing the different pieces of text.
96
+ :return: a pandas Series containing the embedding vectors.
97
+ """
98
+ if not isinstance(text_col, pd.Series):
99
+ raise TypeError("text_col must be a pandas Series")
100
+ self.check_invalid_index(field=text_col)
101
+
102
+ ds = Dataset.from_dict({"text": text_col})
103
+
104
+ ds.set_transform(partial(self.tokenize, text_feat_name="text"))
105
+ logger.info("Generating embedding vectors")
106
+ ds = ds.map(
107
+ lambda batch: self._get_embedding_vector(batch, "cls_token"),
108
+ batched=True,
109
+ batch_size=self.batch_size,
110
+ )
111
+ return cast(pd.DataFrame, ds.to_pandas())["embedding_vector"]
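A sketch of the sequence-classification generator above, showing the optional `class_label_col` described in its docstring (the label sentence is prepended to each text before tokenization); the sample data is made up:

```python
import pandas as pd

from arize.embeddings.nlp_generators import (
    EmbeddingGeneratorForNLPSequenceClassification,
)

generator = EmbeddingGeneratorForNLPSequenceClassification()

df = pd.DataFrame(
    {
        "text": ["great battery life", "screen cracked on day one"],
        "label": ["positive", "negative"],
    }
)
# Each row is embedded as " The classification label is <label>. <text>".
df["text_vector"] = generator.generate_embeddings(
    text_col=df["text"],
    class_label_col=df["label"],
)
```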
arize/embeddings/tabular_generators.py ADDED
@@ -0,0 +1,161 @@
1
+ import logging
2
+ from functools import partial
3
+ from typing import Dict, List, Optional, Tuple, Union, cast
4
+
5
+ import pandas as pd
6
+
7
+ from arize.embeddings.base_generators import NLPEmbeddingGenerator
8
+ from arize.embeddings.constants import (
9
+ DEFAULT_TABULAR_MODEL,
10
+ IMPORT_ERROR_MESSAGE,
11
+ )
12
+ from arize.embeddings.usecases import UseCases
13
+ from arize.types import is_list_of
14
+
15
+ try:
16
+ from datasets import Dataset
17
+ except ImportError:
18
+ raise ImportError(IMPORT_ERROR_MESSAGE) from None
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ TABULAR_PRETRAINED_MODELS = [
23
+ "bert-base-uncased",
24
+ "distilbert-base-uncased",
25
+ "xlm-roberta-base",
26
+ ]
27
+
28
+
29
+ class EmbeddingGeneratorForTabularFeatures(NLPEmbeddingGenerator):
30
+ def __repr__(self) -> str:
31
+ return (
32
+ f"{self.__class__.__name__}(\n"
33
+ f" use_case={self.use_case},\n"
34
+ f" model_name={self.model_name},\n"
35
+ f" tokenizer_max_length={self.tokenizer_max_length},\n"
36
+ f" tokenizer={self.tokenizer.__class__},\n"
37
+ f" model={self.model.__class__},\n"
38
+ f")"
39
+ )
40
+
41
+ def __init__(
42
+ self,
43
+ model_name: str = DEFAULT_TABULAR_MODEL,
44
+ **kwargs,
45
+ ):
46
+ if model_name not in TABULAR_PRETRAINED_MODELS:
47
+ raise ValueError(
48
+ "model_name not supported. Check supported models with "
49
+ "`EmbeddingGeneratorForTabularFeatures.list_pretrained_models()`"
50
+ )
51
+ super().__init__(
52
+ use_case=UseCases.STRUCTURED.TABULAR_EMBEDDINGS,
53
+ model_name=model_name,
54
+ **kwargs,
55
+ )
56
+
57
+ def generate_embeddings(
58
+ self,
59
+ df: pd.DataFrame,
60
+ selected_columns: List[str],
61
+ col_name_map: Optional[Dict[str, str]] = None,
62
+ return_prompt_col: bool = False,
63
+ ) -> Union[pd.Series, Tuple[pd.Series, pd.Series]]:
64
+ """
65
+ Obtain embedding vectors from your tabular data. Prompts are generated from your
66
+ `selected_columns` and passed to a pre-trained large language model for embedding vector
67
+ computation.
68
+
69
+ :param df: pandas DataFrame containing the tabular data, not all columns will be
70
+ considered, see `selected_columns`.
71
+ :param selected_columns: columns to be considered to construct the prompt to be passed to
72
+ the LLM.
73
+ :param col_name_map: mapping between selected column names and a more verbose description of
74
+ the name. This helps the LLM understand the features better.
75
+ :param return_prompt_col: if set to True, an extra pandas Series will be returned
76
+ containing the constructed prompts. Defaults to False.
77
+ :return: a pandas Series containing the embedding vectors and, if `return_prompt_col` is
78
+ set to True, a pandas Series containing the prompts created from tabular features.
79
+ """
80
+ if col_name_map is None:
81
+ col_name_map = {}
82
+ if not isinstance(df, pd.DataFrame):
83
+ raise TypeError("df must be a pandas DataFrame")
84
+ self.check_invalid_index(field=df)
85
+
86
+ if not is_list_of(selected_columns, str):
87
+ raise TypeError("columns must be a list of column names (strings)")
88
+ missing_cols = set(selected_columns).difference(df.columns)
89
+ if missing_cols:
90
+ raise ValueError(
91
+ "selected_columns list must only contain columns of the dataframe. "
92
+ f"The following columns are not found {missing_cols}"
93
+ )
94
+
95
+ if not isinstance(col_name_map, dict):
96
+ raise TypeError(
97
+ "col_name_map must be a dictionary mapping column names to new column "
98
+ "names"
99
+ )
100
+ for k, v in col_name_map.items():
101
+ if not isinstance(k, str) or not isinstance(v, str):
102
+ raise ValueError(
103
+ "col_name_map dictionary keys and values should be strings"
104
+ )
105
+ missing_cols = set(col_name_map.keys()).difference(df.columns)
106
+ if missing_cols:
107
+ raise ValueError(
108
+ "col_name_map must only contain keys which are columns of the dataframe. "
109
+ f"The following columns are not found {missing_cols}"
110
+ )
111
+
112
+ prompts = df.rename(columns=col_name_map).apply(
113
+ partial(
114
+ self.__prompt_fn,
115
+ columns=[
116
+ col_name_map.get(col, col) for col in selected_columns
117
+ ],
118
+ ),
119
+ axis=1,
120
+ )
121
+ ds = Dataset.from_dict({"prompt": prompts})
122
+ ds.set_transform(partial(self.tokenize, text_feat_name="prompt"))
123
+ logger.info("Generating embedding vectors")
124
+ ds = ds.map(
125
+ lambda batch: self._get_embedding_vector(
126
+ batch, self.__get_method_for_embedding_calculation()
127
+ ),
128
+ batched=True,
129
+ batch_size=self.batch_size,
130
+ )
131
+
132
+ if return_prompt_col:
133
+ return (
134
+ cast(pd.DataFrame, ds.to_pandas())["embedding_vector"],
135
+ cast(pd.Series, prompts),
136
+ )
137
+
138
+ return cast(pd.DataFrame, ds.to_pandas())["embedding_vector"]
139
+
140
+ @staticmethod
141
+ def __prompt_fn(row: pd.DataFrame, columns: List[str]) -> str:
142
+ return " ".join(
143
+ f"The {col.replace('_', ' ')} is {str(row[col]).strip()}."
144
+ for col in columns
145
+ )
146
+
147
+ def __get_method_for_embedding_calculation(self):
148
+ try:
149
+ return {
150
+ "bert-base-uncased": "avg_token",
151
+ "distilbert-base-uncased": "avg_token",
152
+ "xlm-roberta-base": "cls_token",
153
+ }[self.model_name]
154
+ except Exception as exc:
155
+ raise ValueError(
156
+ f"Unsupported model_name {self.model_name}"
157
+ ) from exc
158
+
159
+ @staticmethod
160
+ def list_pretrained_models() -> pd.DataFrame:
161
+ return pd.DataFrame({"Model Name": sorted(TABULAR_PRETRAINED_MODELS)})
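A sketch of the tabular generator above; the column names and `col_name_map` values are illustrative and only show how the prompt is assembled from `selected_columns`:

```python
import pandas as pd

from arize.embeddings import EmbeddingGenerator, UseCases

generator = EmbeddingGenerator.from_use_case(
    use_case=UseCases.STRUCTURED.TABULAR_EMBEDDINGS,
    model_name="distilbert-base-uncased",
)

df = pd.DataFrame({"loan_amount": [1200, 300], "term_months": [36, 12]})
vectors, prompts = generator.generate_embeddings(
    df=df,
    selected_columns=["loan_amount", "term_months"],
    col_name_map={"term_months": "loan term in months"},
    return_prompt_col=True,
)
# prompts[0] == "The loan amount is 1200. The loan term in months is 36."
```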
arize/embeddings/usecases.py ADDED
@@ -0,0 +1,26 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum, auto, unique
3
+
4
+
5
+ @unique
6
+ class NLPUseCases(Enum):
7
+ SEQUENCE_CLASSIFICATION = auto()
8
+ SUMMARIZATION = auto()
9
+
10
+
11
+ @unique
12
+ class CVUseCases(Enum):
13
+ IMAGE_CLASSIFICATION = auto()
14
+ OBJECT_DETECTION = auto()
15
+
16
+
17
+ @unique
18
+ class TabularUsecases(Enum):
19
+ TABULAR_EMBEDDINGS = auto()
20
+
21
+
22
+ @dataclass
23
+ class UseCases:
24
+ NLP = NLPUseCases
25
+ CV = CVUseCases
26
+ STRUCTURED = TabularUsecases
arize/utils/online_tasks/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from arize.utils.online_tasks.dataframe_preprocessor import (
2
+ extract_nested_data_to_column,
3
+ )
4
+
5
+ __all__ = ["extract_nested_data_to_column"]
arize/utils/online_tasks/dataframe_preprocessor.py ADDED
@@ -0,0 +1,235 @@
1
+ import json
2
+ import logging
3
+ from typing import Any, List, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ def extract_nested_data_to_column(
12
+ attributes: List[str], df: pd.DataFrame
13
+ ) -> pd.DataFrame:
14
+ """
15
+ This function, used in Online Tasks, is typically run on data exported from Arize.
16
+ It prepares the DataFrame by extracting relevant attributes from complex, deeply nested
17
+ data structures, such as those found in LLM outputs or JSON-like records. It helps extract
18
+ specific values from these nested structures by identifying the longest matching column name
19
+ in the DataFrame and recursively accessing the desired attribute path within each row.
20
+ This preprocessing step ensures that the extracted values are available as new columns,
21
+ allowing evaluators to process and assess these values effectively.
22
+
23
+ For each attribute string in `attributes` (e.g. "attributes.llm.output_messages.0.message.content"),
24
+ 1) Find the largest prefix that is actually a column name in `df`. (e.g. "attributes.llm.output_messages")
25
+ 2) Use the remainder of the attribute as the introspect path for the values in that column:
26
+ - Calls _introspect_arize_attribute({row_value}, {attribute_remainder}) for each row value
27
+ e.g. {row_value} = [{'message.role': 'assistant',
28
+ 'message.content': 'The capital of China is Beijing.'}]
29
+ e.g. {attribute_remainder} = "0.message.content"
30
+ - This introspect function recursively indexes into a given row_value based on
31
+ the attribute_remainder path and is able to handle a variety of nested structures
32
+ such as the example given for {row_value}
33
+ 3) Create a new column named exactly `attribute`, filling it row-by-row with the result
34
+ of introspecting into the column's value. (e.g. row extracted: 'The capital of China is Beijing.')
35
+ If introspect fails or yields None, store NaN.
36
+ 4) After all columns have been created, drop rows that have NaN in *any* of the newly-created columns.
37
+ 5) Log how many rows were dropped and, if zero rows remain, log a message indicating that
38
+ there are no rows satisfying *all* of the queries.
39
+ """
40
+
41
+ # Make a copy so as not to alter the input df
42
+ result_df = df.copy()
43
+
44
+ # Keep track of which new columns we add. Each column name will match each user-inputted attribute
45
+ # (e.g. "attributes.llm.output_messages.0.message.content")
46
+ new_cols: List[str] = []
47
+
48
+ for attribute in attributes:
49
+ parts = attribute.split(".")
50
+ prefix_col = None
51
+ prefix_len = 0
52
+
53
+ # 1) Find largest prefix of attribute that matches a column in df
54
+ for i in range(1, len(parts) + 1):
55
+ candidate = ".".join(parts[:i])
56
+ if candidate in result_df.columns:
57
+ prefix_col = candidate
58
+ prefix_len = i
59
+
60
+ if prefix_col is None:
61
+ raise Exception("No such column found in DataFrame.")
62
+
63
+ # 2) The remainder after the prefix
64
+ remainder = ".".join(parts[prefix_len:])
65
+
66
+ # 3) Apply introspect row-by-row
67
+ def apply_introspect_arize_attribute(
68
+ row: pd.Series,
69
+ prefix_col: str = prefix_col,
70
+ remainder: str = remainder,
71
+ ) -> Any:
72
+ val = row[prefix_col]
73
+ try:
74
+ result = _introspect_arize_attribute(val, remainder)
75
+ return result if result is not None else np.nan
76
+ except Exception:
77
+ return np.nan
78
+
79
+ result_df[attribute] = result_df.apply(
80
+ apply_introspect_arize_attribute, axis=1
81
+ )
82
+
83
+ new_cols.append(attribute)
84
+
85
+ # 4) Drop rows that are NaN in *any* of the newly-added columns
86
+ rows_before = len(df)
87
+ result_df = result_df.dropna(subset=new_cols)
88
+ rows_after = len(result_df)
89
+ rows_dropped = rows_before - rows_after
90
+
91
+ # 5) Log some diagnostics
92
+ logger.info(f"Rows before processing: {rows_before}")
93
+ logger.info(f"Rows after processing: {rows_after}")
94
+ logger.info(f"Rows dropped: {rows_dropped}")
95
+
96
+ if rows_after == 0:
97
+ logger.info(
98
+ f"For the given filter, there are no rows that have ALL of the following variables: {attributes}"
99
+ )
100
+
101
+ return result_df
102
+
103
+
104
+ def _introspect_arize_attribute(value: Any, attribute: str) -> Any:
105
+ """
106
+ Recursively drill into `value` following the dot-delimited `attribute`.
107
+ Example:
108
+ value: [{'message.role': 'assistant', 'message.content': 'The capital of China is Beijing.'}]
109
+ attribute: "0.message.content"
110
+ Returns: 'The capital of China is Beijing.'
111
+
112
+ - Returns None immediately when a key or index is not found
113
+ - Handles integer parts for lists
114
+ - Parses JSON strings
115
+ - Converts NumPy arrays to lists
116
+ - Allows dotted keys (e.g. "message.content") by combining parts
117
+
118
+ """
119
+ if not attribute:
120
+ return value
121
+
122
+ attribute_parts = attribute.split(".")
123
+ return _introspect_arize_attribute_parts(value, attribute_parts)
124
+
125
+
126
+ def _introspect_arize_attribute_parts(
127
+ current_value: Any, attribute_parts_unprocessed: List[str]
128
+ ) -> Any:
129
+ # If no more parts, we return whatever we have
130
+ if not attribute_parts_unprocessed:
131
+ return current_value
132
+
133
+ current_value = _ensure_deserialized(current_value)
134
+
135
+ # Parse out the next value using the first (or combined) part(s).
136
+ parsed_value, num_parts_processed = _parse_value(
137
+ current_value, attribute_parts_unprocessed
138
+ )
139
+
140
+ # If we can't find a match, immediately return None
141
+ if parsed_value is None:
142
+ return None
143
+
144
+ # Otherwise, recurse deeper with the leftover parts
145
+ return _introspect_arize_attribute_parts(
146
+ parsed_value, attribute_parts_unprocessed[num_parts_processed:]
147
+ )
148
+
149
+
150
+ def _parse_value(
151
+ current_value: Any, attribute_parts_unprocessed: List[str]
152
+ ) -> Tuple[Any, int]:
153
+ """
154
+ Attempt to parse out the next value from `current_value` using the earliest parts:
155
+
156
+ 1) If `attribute_parts_unprocessed[0]` is an integer index and `current_value` is a list/tuple,
157
+ index into it.
158
+ 2) Else if `current_value` is a dict, check if `attribute_parts_unprocessed[0]` is a key.
159
+ If not found, try combining `attribute_parts_unprocessed[0] + '.' + attribute_parts_unprocessed[1]`...
160
+ to handle dotted keys in the dict.
161
+ 3) If none match, return (None, 1) to signal "not found, consume 1 part."
162
+
163
+ Returns (parsed_value, num_parts_processed):
164
+ - parsed_value: the found value or None if not found
165
+ - num_parts_processed: how many parts were processed (1 or more)
166
+ """
167
+
168
+ if not attribute_parts_unprocessed:
169
+ return (None, 0)
170
+
171
+ key = attribute_parts_unprocessed[
172
+ 0
173
+ ] # If key is an int, then it represents a list index
174
+ num_parts_processed = (
175
+ 1 # By default, we're at least consuming this first part
176
+ )
177
+
178
+ # 1) Try integer index (e.g. "0" => 0)
179
+ idx = _try_int(key)
180
+ if idx is not None:
181
+ # Must be a tuple or list (_ensure_deserialized() already casts numpy arrays to python lists)
182
+ if isinstance(current_value, (list, tuple)):
183
+ if 0 <= idx < len(current_value):
184
+ return (current_value[idx], num_parts_processed)
185
+ else:
186
+ return (None, num_parts_processed)
187
+ else:
188
+ return (None, num_parts_processed)
189
+
190
+ # 2) Try dict approach
191
+ if isinstance(current_value, dict):
192
+ # a) direct match
193
+ if key in current_value:
194
+ return (current_value[key], num_parts_processed)
195
+ else:
196
+ # b) try combining multiple parts to handle dotted key
197
+ for num_parts_processed in range(
198
+ 1, len(attribute_parts_unprocessed)
199
+ ):
200
+ key += "." + attribute_parts_unprocessed[num_parts_processed]
201
+ if key in current_value:
202
+ return (
203
+ current_value[key],
204
+ num_parts_processed + 1,
205
+ )
206
+ return (None, num_parts_processed)
207
+
208
+ # If we get here, we couldn't handle it (not a list or dict or mismatch)
209
+ return (None, num_parts_processed)
210
+
211
+
212
+ def _ensure_deserialized(val: Any) -> Any:
213
+ """
214
+ 1) If `val` is a numpy array, convert to a Python list.
215
+ 2) If `val` is a string, attempt to parse as JSON.
216
+ 3) Otherwise return as-is.
217
+ """
218
+ if isinstance(val, np.ndarray):
219
+ val = val.tolist()
220
+
221
+ if isinstance(val, str):
222
+ try:
223
+ return json.loads(val)
224
+ except (json.JSONDecodeError, TypeError, ValueError):
225
+ pass
226
+
227
+ return val
228
+
229
+
230
+ def _try_int(s: str) -> int | None:
231
+ """Attempt to convert s to int, return None on failure."""
232
+ try:
233
+ return int(s)
234
+ except ValueError:
235
+ return None
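A small sketch of `extract_nested_data_to_column` on the kind of exported span data its docstring describes; the column and attribute names mirror the docstring example:

```python
import pandas as pd

from arize.utils.online_tasks import extract_nested_data_to_column

df = pd.DataFrame(
    {
        "attributes.llm.output_messages": [
            [
                {
                    "message.role": "assistant",
                    "message.content": "The capital of China is Beijing.",
                }
            ],
            None,  # rows where the path cannot be resolved are dropped
        ]
    }
)

out = extract_nested_data_to_column(
    attributes=["attributes.llm.output_messages.0.message.content"],
    df=df,
)
print(out["attributes.llm.output_messages.0.message.content"].iloc[0])
# -> The capital of China is Beijing.
```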
arize/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "8.0.0a10"
1
+ __version__ = "8.0.0a11"
{arize-8.0.0a10.dist-info → arize-8.0.0a11.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arize
3
- Version: 8.0.0a10
3
+ Version: 8.0.0a11
4
4
  Summary: A helper library to interact with Arize AI APIs
5
5
  Project-URL: Homepage, https://arize.com
6
6
  Project-URL: Documentation, https://docs.arize.com/arize
@@ -27,6 +27,13 @@ Classifier: Topic :: System :: Monitoring
27
27
  Requires-Python: >=3.10
28
28
  Requires-Dist: lazy-imports
29
29
  Requires-Dist: numpy>=2.0.0
30
+ Provides-Extra: auto-embeddings
31
+ Requires-Dist: datasets!=2.14.*,<3,>=2.8; extra == 'auto-embeddings'
32
+ Requires-Dist: pandas<3,>=1.0.0; extra == 'auto-embeddings'
33
+ Requires-Dist: pillow<11,>=8.4.0; extra == 'auto-embeddings'
34
+ Requires-Dist: tokenizers<1,>=0.13; extra == 'auto-embeddings'
35
+ Requires-Dist: torch<3,>=1.13; extra == 'auto-embeddings'
36
+ Requires-Dist: transformers<5,>=4.25; extra == 'auto-embeddings'
30
37
  Provides-Extra: dev
31
38
  Requires-Dist: pytest==8.4.2; extra == 'dev'
32
39
  Requires-Dist: ruff==0.13.2; extra == 'dev'
@@ -84,6 +91,7 @@ Description-Content-Type: text/markdown
84
91
  - [Stream log ML Data for a Classification use-case](#stream-log-ml-data-for-a-classification-use-case)
85
92
  - [Log a batch of ML Data for a Object Detection use-case](#log-a-batch-of-ml-data-for-a-object-detection-use-case)
86
93
  - [Exporting ML Data](#exporting-ml-data)
94
+ - [Generate embeddings for your data](#generate-embeddings-for-your-data)
87
95
  - [Community](#community)
88
96
 
89
97
  # Overview
@@ -326,6 +334,38 @@ df = client.models.export_to_df(
326
334
  )
327
335
  ```
328
336
 
337
+ ## Generate embeddings for your data
338
+
339
+ ```python
340
+ import pandas as pd
341
+ from arize.embeddings import EmbeddingGenerator, UseCases
342
+
343
+ # You can check available models
344
+ print(EmbeddingGenerator.list_pretrained_models())
345
+
346
+ # Example dataframe
347
+ df = pd.DataFrame(
348
+ {
349
+ "text": [
350
+ "Hello world.",
351
+ "Artificial Intelligence is the future.",
352
+ "Spain won the FIFA World Cup on 2010.",
353
+ ],
354
+ }
355
+ )
356
+ # Instantiate the generator for your usecase, selecting the base model
357
+ generator = EmbeddingGenerator.from_use_case(
358
+ use_case=UseCases.NLP.SEQUENCE_CLASSIFICATION,
359
+ model_name="distilbert-base-uncased",
360
+ tokenizer_max_length=512,
361
+ batch_size=100,
362
+ )
363
+
364
+ # Generate embeddings
365
+ df["text_vector"] = generator.generate_embeddings(text_col=df["text"])
366
+ ```
367
+
368
+
329
369
  # Community
330
370
 
331
371
  Join our community to connect with thousands of AI builders.
{arize-8.0.0a10.dist-info → arize-8.0.0a11.dist-info}/RECORD RENAMED
@@ -1,10 +1,10 @@
1
1
  arize/__init__.py,sha256=-4bbbZwcjGS9OfAunsB-lmKRCzccPdFvZmvJQJEky3E,534
2
2
  arize/_lazy.py,sha256=MVep6D93sJWvArg4pgm4CVNGc6tu-XRK_Z7EDMuc76I,2358
3
- arize/client.py,sha256=0LtZU3WeEatGd1QgQsMrJOuI-tFmzM3y1AfO74BLJys,5716
3
+ arize/client.py,sha256=kDdOWC1rwYgPPExO3wT3-KU3qpMwQ0ogrAdjvf7Ls3M,5860
4
4
  arize/config.py,sha256=iynVEZhrOPdTNJTQ_KQmwKOPiwL0LfEP8AUIDYW86Xw,5801
5
5
  arize/logging.py,sha256=2vwdta2-kR78GeBFGK2vpk51rQ2d06HoKzuARI9qFQk,7317
6
6
  arize/types.py,sha256=z1yg5-brmTD4kVHDmmTVkYke53JpusXXeOOpdQw7rYg,69508
7
- arize/version.py,sha256=Wv8B6KxzS2ThGtkzs_13OkvwSugf5HITHYMQsGk1gjg,25
7
+ arize/version.py,sha256=YFPzyK5jfODAbvUqUQHeQ5WVmHl6zTh9HSFOA75S0rc,25
8
8
  arize/_exporter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  arize/_exporter/client.py,sha256=eAxJX1sUfdpLrtaQ0ynMTd5jI37JOp9fbl3NWp4WFEA,15216
10
10
  arize/_exporter/validation.py,sha256=6ROu5p7uaolxQ93lO_Eiwv9NVw_uyi3E5T--C5Klo5Q,1021
@@ -59,6 +59,15 @@ arize/constants/model_mapping.json,sha256=OPE54rBATzmwRhx0tycsxnGae1jBhtqEmQqQvz
59
59
  arize/constants/spans.py,sha256=EfMgbEIK_2EUcvUY5BGnNAbS7bupBKePlI3j2L5T5CE,2532
60
60
  arize/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
61
  arize/datasets/client.py,sha256=Rk3TQF2IzJwi3JqF1GYt1tUs68gPIiVFRgKjEmY7igE,743
62
+ arize/embeddings/__init__.py,sha256=6_C8908W_qDixkoBJl1wapgmQCzI8TPLH207kzbYsFA,156
63
+ arize/embeddings/auto_generator.py,sha256=ukZUJWRkiG9HFgSHXhr44rt2tdVHn1phb7_nOxYXWEg,4111
64
+ arize/embeddings/base_generators.py,sha256=HybEUAzeESswEDmkmvPayzFab1y8deg5X20HSphGp8Q,8855
65
+ arize/embeddings/constants.py,sha256=77LEXcXr_MPGRVSE06-4opFGeYrtdMmosQX91yQu6p0,1104
66
+ arize/embeddings/cv_generators.py,sha256=8eXwvP_kvAt8I9WA-0tRJd0XID4lFOydyTYfOMW_-xo,880
67
+ arize/embeddings/errors.py,sha256=T8PTFELs-xs7GXDmx402T_-DCkCXkV1CxdKAc2jAM2s,1517
68
+ arize/embeddings/nlp_generators.py,sha256=AVUpr95nQChVGAUiruCoME8tcrh79PaRrbKI7H1gGBE,3843
69
+ arize/embeddings/tabular_generators.py,sha256=lj2wVmJTfqjrziDI6Z-EEQzdwSZOml2G8PN1O4Zo5SA,5970
70
+ arize/embeddings/usecases.py,sha256=czoj5xk_WyYBsc9LE79JtkMbMN4RfKilwlm8pxl3Q_8,442
62
71
  arize/exceptions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
72
  arize/exceptions/auth.py,sha256=5hy7hhvgRCnqPACBTfG_0OICmcd9OlHQLHGhLysA6mA,403
64
73
  arize/exceptions/base.py,sha256=TWdtMulMi1Cg6X8nne_nlg8DY0zmLHb-hW9AbvjMGOs,3261
@@ -109,7 +118,9 @@ arize/utils/arrow.py,sha256=4In1gQc0i4Rb8zuwI0w-Hv-10wiItu5opqqGrJ8tSzo,5277
109
118
  arize/utils/casting.py,sha256=KUrPUQN6qJEVe39nxbr0T-0GjAJLHjf4xWuzV71QezI,12468
110
119
  arize/utils/dataframe.py,sha256=I0FloPgNiqlKga32tMOvTE70598QA8Hhrgf-6zjYMAM,1120
111
120
  arize/utils/proto.py,sha256=9vLo53INYjdF78ffjm3E48jFwK6LbPD2FfKei7VaDy8,35477
112
- arize-8.0.0a10.dist-info/METADATA,sha256=9u9UPm9jOeZp9pxLo9R5mDYvrACrOzbPET51mNyyXQU,12567
113
- arize-8.0.0a10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
114
- arize-8.0.0a10.dist-info/licenses/LICENSE.md,sha256=8vLN8Gms62NCBorxIv9MUvuK7myueb6_-dhXHPmm4H0,1479
115
- arize-8.0.0a10.dist-info/RECORD,,
121
+ arize/utils/online_tasks/__init__.py,sha256=nDuTLUTYnZaWgyJoYR1P7O8ZKA-Nba7X6tJ9OislbWM,144
122
+ arize/utils/online_tasks/dataframe_preprocessor.py,sha256=YyeeeFu_FwCYImbYvBZvQIH_5TK2lHru8KSfqV893ps,8884
123
+ arize-8.0.0a11.dist-info/METADATA,sha256=8VQP8JDh48Lbj07BqrntHaJjRdlalM7a5Zq3pv5s7E0,13842
124
+ arize-8.0.0a11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
125
+ arize-8.0.0a11.dist-info/licenses/LICENSE.md,sha256=8vLN8Gms62NCBorxIv9MUvuK7myueb6_-dhXHPmm4H0,1479
126
+ arize-8.0.0a11.dist-info/RECORD,,