arize 8.0.0a10__py3-none-any.whl → 8.0.0a11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arize/client.py CHANGED
@@ -12,11 +12,13 @@ if TYPE_CHECKING:
12
12
  from arize.spans.client import SpansClient
13
13
 
14
14
 
15
+ # TODO(Kiko): experimental/datasets must be adapted into the datasets subclient
16
+ # TODO(Kiko): experimental/prompt hub is missing
17
+ # TODO(Kiko): exporter/utils/schema_parser is missing
15
18
  # TODO(Kiko): Go through main APIs and add CtxAdapter where missing
16
19
  # TODO(Kiko): Search and handle other TODOs
17
20
  # TODO(Kiko): Go over **every file** and do not import anything at runtime, use `if TYPE_CHECKING`
18
21
  # with `from __future__ import annotations` (must include for Python < 3.11)
19
- # TODO(Kiko): MIMIC Explainer not done
20
22
  # TODO(Kiko): Go over docstrings
21
23
  class ArizeClient(LazySubclientsMixin):
22
24
  """
arize/embeddings/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from arize.embeddings.auto_generator import EmbeddingGenerator
2
+ from arize.embeddings.usecases import UseCases
3
+
4
+ __all__ = ["EmbeddingGenerator", "UseCases"]
arize/embeddings/auto_generator.py ADDED
@@ -0,0 +1,108 @@
1
+ from typing import Any
2
+
3
+ import pandas as pd
4
+
5
+ from arize.embeddings import constants
6
+ from arize.embeddings.base_generators import BaseEmbeddingGenerator
7
+ from arize.embeddings.constants import (
8
+ CV_PRETRAINED_MODELS,
9
+ DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL,
10
+ DEFAULT_CV_OBJECT_DETECTION_MODEL,
11
+ DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL,
12
+ DEFAULT_NLP_SUMMARIZATION_MODEL,
13
+ DEFAULT_TABULAR_MODEL,
14
+ NLP_PRETRAINED_MODELS,
15
+ )
16
+ from arize.embeddings.cv_generators import (
17
+ EmbeddingGeneratorForCVImageClassification,
18
+ EmbeddingGeneratorForCVObjectDetection,
19
+ )
20
+ from arize.embeddings.nlp_generators import (
21
+ EmbeddingGeneratorForNLPSequenceClassification,
22
+ EmbeddingGeneratorForNLPSummarization,
23
+ )
24
+ from arize.embeddings.tabular_generators import (
25
+ EmbeddingGeneratorForTabularFeatures,
26
+ )
27
+ from arize.embeddings.usecases import UseCases
28
+
29
+ UseCaseLike = str | UseCases.NLP | UseCases.CV | UseCases.STRUCTURED
30
+
31
+
32
+ class EmbeddingGenerator:
33
+ def __init__(self, **kwargs: str):
34
+ raise OSError(
35
+ f"{self.__class__.__name__} is designed to be instantiated using the "
36
+ f"`{self.__class__.__name__}.from_use_case(use_case, **kwargs)` method."
37
+ )
38
+
39
+ @staticmethod
40
+ def from_use_case(
41
+ use_case: UseCaseLike, **kwargs: Any
42
+ ) -> BaseEmbeddingGenerator:
43
+ if use_case == UseCases.NLP.SEQUENCE_CLASSIFICATION:
44
+ return EmbeddingGeneratorForNLPSequenceClassification(**kwargs)
45
+ elif use_case == UseCases.NLP.SUMMARIZATION:
46
+ return EmbeddingGeneratorForNLPSummarization(**kwargs)
47
+ elif use_case == UseCases.CV.IMAGE_CLASSIFICATION:
48
+ return EmbeddingGeneratorForCVImageClassification(**kwargs)
49
+ elif use_case == UseCases.CV.OBJECT_DETECTION:
50
+ return EmbeddingGeneratorForCVObjectDetection(**kwargs)
51
+ elif use_case == UseCases.STRUCTURED.TABULAR_EMBEDDINGS:
52
+ return EmbeddingGeneratorForTabularFeatures(**kwargs)
53
+ else:
54
+ raise ValueError(f"Invalid use case {use_case}")
55
+
56
+ @classmethod
57
+ def list_default_models(cls) -> pd.DataFrame:
58
+ df = pd.DataFrame(
59
+ {
60
+ "Area": ["NLP", "NLP", "CV", "CV", "STRUCTURED"],
61
+ "Usecase": [
62
+ UseCases.NLP.SEQUENCE_CLASSIFICATION.name,
63
+ UseCases.NLP.SUMMARIZATION.name,
64
+ UseCases.CV.IMAGE_CLASSIFICATION.name,
65
+ UseCases.CV.OBJECT_DETECTION.name,
66
+ UseCases.STRUCTURED.TABULAR_EMBEDDINGS.name,
67
+ ],
68
+ "Model Name": [
69
+ DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL,
70
+ DEFAULT_NLP_SUMMARIZATION_MODEL,
71
+ DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL,
72
+ DEFAULT_CV_OBJECT_DETECTION_MODEL,
73
+ DEFAULT_TABULAR_MODEL,
74
+ ],
75
+ }
76
+ )
77
+ df.sort_values(
78
+ by=[col for col in df.columns], ascending=True, inplace=True
79
+ )
80
+ return df.reset_index(drop=True)
81
+
82
+ @classmethod
83
+ def list_pretrained_models(cls) -> pd.DataFrame:
84
+ data = {
85
+ "Task": ["NLP" for _ in NLP_PRETRAINED_MODELS]
86
+ + ["CV" for _ in CV_PRETRAINED_MODELS],
87
+ "Architecture": [
88
+ cls.__parse_model_arch(model)
89
+ for model in NLP_PRETRAINED_MODELS + CV_PRETRAINED_MODELS
90
+ ],
91
+ "Model Name": NLP_PRETRAINED_MODELS + CV_PRETRAINED_MODELS,
92
+ }
93
+ df = pd.DataFrame(data)
94
+ df.sort_values(
95
+ by=[col for col in df.columns], ascending=True, inplace=True
96
+ )
97
+ return df.reset_index(drop=True)
98
+
99
+ @staticmethod
100
+ def __parse_model_arch(model_name: str) -> str:
101
+ if constants.GPT.lower() in model_name.lower():
102
+ return constants.GPT
103
+ elif constants.BERT.lower() in model_name.lower():
104
+ return constants.BERT
105
+ elif constants.VIT.lower() in model_name.lower():
106
+ return constants.VIT
107
+ else:
108
+ raise ValueError("Invalid model_name, unknown architecture.")
arize/embeddings/base_generators.py ADDED
@@ -0,0 +1,255 @@
1
+ import os
2
+ from abc import ABC, abstractmethod
3
+ from enum import Enum
4
+ from functools import partial
5
+ from typing import Dict, List, Union, cast
6
+
7
+ import pandas as pd
8
+
9
+ import arize.embeddings.errors as err
10
+ from arize.embeddings.constants import IMPORT_ERROR_MESSAGE
11
+
12
+ try:
13
+ import torch
14
+ from datasets import Dataset
15
+ from PIL import Image
16
+ from transformers import ( # type: ignore
17
+ AutoImageProcessor,
18
+ AutoModel,
19
+ AutoTokenizer,
20
+ BatchEncoding,
21
+ )
22
+ from transformers.utils import logging as transformer_logging
23
+ except ImportError as e:
24
+ raise ImportError(IMPORT_ERROR_MESSAGE) from e
25
+
26
+ import logging
27
+
28
+ logger = logging.getLogger(__name__)
29
+ transformer_logging.set_verbosity(50)
30
+ transformer_logging.enable_progress_bar()
31
+
32
+
33
+ class BaseEmbeddingGenerator(ABC):
34
+ def __init__(
35
+ self, use_case: Enum, model_name: str, batch_size: int = 100, **kwargs
36
+ ):
37
+ self.__use_case = self._parse_use_case(use_case=use_case)
38
+ self.__model_name = model_name
39
+ self.__device = self.select_device()
40
+ self.__batch_size = batch_size
41
+ logger.info(f"Downloading pre-trained model '{self.model_name}'")
42
+ try:
43
+ self.__model = AutoModel.from_pretrained(
44
+ self.model_name, **kwargs
45
+ ).to(self.device)
46
+ except OSError as e:
47
+ raise err.HuggingFaceRepositoryNotFound(model_name) from e
48
+ except Exception as e:
49
+ raise e
50
+
51
+ @abstractmethod
52
+ def generate_embeddings(self, **kwargs) -> pd.Series: ...
53
+
54
+ def select_device(self) -> torch.device:
55
+ if torch.cuda.is_available():
56
+ return torch.device("cuda")
57
+ elif torch.backends.mps.is_available():
58
+ return torch.device("mps")
59
+ else:
60
+ logger.warning(
61
+ "No available GPU has been detected. The use of GPU acceleration is "
62
+ "strongly recommended. You can check for GPU availability by running "
63
+ "`torch.cuda.is_available()` or `torch.backends.mps.is_available()`."
64
+ )
65
+ return torch.device("cpu")
66
+
67
+ @property
68
+ def use_case(self) -> str:
69
+ return self.__use_case
70
+
71
+ @property
72
+ def model_name(self) -> str:
73
+ return self.__model_name
74
+
75
+ @property
76
+ def model(self):
77
+ return self.__model
78
+
79
+ @property
80
+ def device(self) -> torch.device:
81
+ return self.__device
82
+
83
+ @property
84
+ def batch_size(self) -> int:
85
+ return self.__batch_size
86
+
87
+ @batch_size.setter
88
+ def batch_size(self, new_batch_size: int) -> None:
89
+ err_message = "New batch size should be an integer greater than 0."
90
+ if not isinstance(new_batch_size, int):
91
+ raise TypeError(err_message)
92
+ elif new_batch_size <= 0:
93
+ raise ValueError(err_message)
94
+ else:
95
+ self.__batch_size = new_batch_size
96
+ logger.info(f"Batch size has been set to {new_batch_size}.")
97
+
98
+ @staticmethod
99
+ def _parse_use_case(use_case: Enum) -> str:
100
+ uc_area = use_case.__class__.__name__.split("UseCases")[0]
101
+ uc_task = use_case.name
102
+ return f"{uc_area}.{uc_task}"
103
+
104
+ def _get_embedding_vector(
105
+ self, batch: Dict[str, torch.Tensor], method
106
+ ) -> Dict[str, torch.Tensor]:
107
+ with torch.no_grad():
108
+ outputs = self.model(**batch)
109
+ # (batch_size, seq_length/or/num_tokens, hidden_size)
110
+ if method == "cls_token": # Select CLS token vector
111
+ embeddings = outputs.last_hidden_state[:, 0, :]
112
+ elif method == "avg_token": # Select avg token vector
113
+ embeddings = torch.mean(outputs.last_hidden_state, 1)
114
+ else:
115
+ raise ValueError(f"Invalid method = {method}")
116
+ return {"embedding_vector": embeddings.cpu().numpy().astype(float)}
117
+
118
+ @staticmethod
119
+ def check_invalid_index(field: Union[pd.Series, pd.DataFrame]) -> None:
120
+ if (field.index != field.reset_index(drop=True).index).any():
121
+ if isinstance(field, pd.DataFrame):
122
+ raise err.InvalidIndexError("DataFrame")
123
+ else:
124
+ raise err.InvalidIndexError(str(field.name))
125
+
126
+ @abstractmethod
127
+ def __repr__(self) -> str:
128
+ pass
129
+
130
+
131
+ class NLPEmbeddingGenerator(BaseEmbeddingGenerator):
132
+ def __repr__(self) -> str:
133
+ return (
134
+ f"{self.__class__.__name__}(\n"
135
+ f" use_case={self.use_case},\n"
136
+ f" model_name='{self.model_name}',\n"
137
+ f" tokenizer_max_length={self.tokenizer_max_length},\n"
138
+ f" tokenizer={self.tokenizer.__class__},\n"
139
+ f" model={self.model.__class__},\n"
140
+ f" batch_size={self.batch_size},\n"
141
+ f")"
142
+ )
143
+
144
+ def __init__(
145
+ self,
146
+ use_case: Enum,
147
+ model_name: str,
148
+ tokenizer_max_length: int = 512,
149
+ **kwargs,
150
+ ):
151
+ super().__init__(use_case=use_case, model_name=model_name, **kwargs)
152
+ self.__tokenizer_max_length = tokenizer_max_length
153
+ # We don't check for the tokenizer's existence since it is coupled with the corresponding model
154
+ # We check the model's existence in `BaseEmbeddingGenerator`
155
+ logger.info(f"Downloading tokenizer for '{self.model_name}'")
156
+ self.__tokenizer = AutoTokenizer.from_pretrained(
157
+ self.model_name, model_max_length=self.tokenizer_max_length
158
+ )
159
+
160
+ @property
161
+ def tokenizer(self):
162
+ return self.__tokenizer
163
+
164
+ @property
165
+ def tokenizer_max_length(self) -> int:
166
+ return self.__tokenizer_max_length
167
+
168
+ def tokenize(
169
+ self, batch: Dict[str, List[str]], text_feat_name: str
170
+ ) -> BatchEncoding:
171
+ return self.tokenizer(
172
+ batch[text_feat_name],
173
+ padding=True,
174
+ truncation=True,
175
+ max_length=self.tokenizer_max_length,
176
+ return_tensors="pt",
177
+ ).to(self.device)
178
+
179
+
180
+ class CVEmbeddingGenerator(BaseEmbeddingGenerator):
181
+ def __repr__(self) -> str:
182
+ return (
183
+ f"{self.__class__.__name__}(\n"
184
+ f" use_case={self.use_case},\n"
185
+ f" model_name='{self.model_name}',\n"
186
+ f" image_processor={self.image_processor.__class__},\n"
187
+ f" model={self.model.__class__},\n"
188
+ f" batch_size={self.batch_size},\n"
189
+ f")"
190
+ )
191
+
192
+ def __init__(self, use_case: Enum, model_name: str, **kwargs):
193
+ super().__init__(use_case=use_case, model_name=model_name, **kwargs)
194
+ logger.info("Downloading image processor")
195
+ # We don't check for the image processor's existence since it is coupled with the corresponding model
196
+ # We check the model's existence in `BaseEmbeddingGenerator`
197
+ self.__image_processor = AutoImageProcessor.from_pretrained(
198
+ self.model_name
199
+ )
200
+
201
+ @property
202
+ def image_processor(self):
203
+ return self.__image_processor
204
+
205
+ @staticmethod
206
+ def open_image(image_path: str) -> Image.Image:
207
+ if not os.path.exists(image_path):
208
+ raise ValueError(f"Cannot find image {image_path}")
209
+ return Image.open(image_path).convert("RGB")
210
+
211
+ def preprocess_image(
212
+ self, batch: Dict[str, List[str]], local_image_feat_name: str
213
+ ):
214
+ return self.image_processor(
215
+ [
216
+ self.open_image(image_path)
217
+ for image_path in batch[local_image_feat_name]
218
+ ],
219
+ return_tensors="pt",
220
+ ).to(self.device)
221
+
222
+ def generate_embeddings(self, local_image_path_col: pd.Series) -> pd.Series:
223
+ """
224
+ Obtain embedding vectors from your image data using pre-trained image models.
225
+
226
+ :param local_image_path_col: a pandas Series containing the local path to the images to
227
+ be used to generate the embedding vectors.
228
+ :return: a pandas Series containing the embedding vectors.
229
+ """
230
+ if not isinstance(local_image_path_col, pd.Series):
231
+ raise TypeError(
232
+ "local_image_path_col_name must be pandas Series object"
233
+ )
234
+ self.check_invalid_index(field=local_image_path_col)
235
+
236
+ # Validate that there are no null image paths
237
+ if local_image_path_col.isnull().any():
238
+ raise ValueError(
239
+ "There can't be any null values in the local_image_path_col series"
240
+ )
241
+
242
+ ds = Dataset.from_dict({"local_path": local_image_path_col})
243
+ ds.set_transform(
244
+ partial(
245
+ self.preprocess_image,
246
+ local_image_feat_name="local_path",
247
+ )
248
+ )
249
+ logger.info("Generating embedding vectors")
250
+ ds = ds.map(
251
+ lambda batch: self._get_embedding_vector(batch, "avg_token"),
252
+ batched=True,
253
+ batch_size=self.batch_size,
254
+ )
255
+ return cast(pd.DataFrame, ds.to_pandas())["embedding_vector"]
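A short sketch of the base-class helpers above (`select_device` and the validated `batch_size` setter), using the NLP sequence-classification generator as a concrete subclass and assuming the `auto-embeddings` extra is installed:

```python
from arize.embeddings.nlp_generators import (
    EmbeddingGeneratorForNLPSequenceClassification,
)

gen = EmbeddingGeneratorForNLPSequenceClassification()
print(gen.device)      # torch.device picked by select_device(): cuda, mps, or cpu
print(gen.batch_size)  # 100 by default

gen.batch_size = 500   # accepted: a positive integer
try:
    gen.batch_size = 0  # rejected by the setter
except ValueError as exc:
    print(exc)
```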
arize/embeddings/constants.py ADDED
@@ -0,0 +1,34 @@
1
+ DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL = "distilbert-base-uncased"
2
+ DEFAULT_NLP_SUMMARIZATION_MODEL = "distilbert-base-uncased"
3
+ DEFAULT_TABULAR_MODEL = "distilbert-base-uncased"
4
+ DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL = "google/vit-base-patch32-224-in21k"
5
+ DEFAULT_CV_OBJECT_DETECTION_MODEL = "facebook/detr-resnet-101"
6
+ NLP_PRETRAINED_MODELS = [
7
+ "bert-base-cased",
8
+ "bert-base-uncased",
9
+ "bert-large-cased",
10
+ "bert-large-uncased",
11
+ "distilbert-base-cased",
12
+ "distilbert-base-uncased",
13
+ "xlm-roberta-base",
14
+ "xlm-roberta-large",
15
+ ]
16
+
17
+ CV_PRETRAINED_MODELS = [
18
+ "google/vit-base-patch16-224-in21k",
19
+ "google/vit-base-patch16-384",
20
+ "google/vit-base-patch32-224-in21k",
21
+ "google/vit-base-patch32-384",
22
+ "google/vit-large-patch16-224-in21k",
23
+ "google/vit-large-patch16-384",
24
+ "google/vit-large-patch32-224-in21k",
25
+ "google/vit-large-patch32-384",
26
+ ]
27
+ IMPORT_ERROR_MESSAGE = (
28
+ "To enable embedding generation, the arize module must be installed with "
29
+ "extra dependencies. Run: pip install 'arize[auto-embeddings]'."
30
+ )
31
+
32
+ GPT = "GPT"
33
+ BERT = "BERT"
34
+ VIT = "ViT"
arize/embeddings/cv_generators.py ADDED
@@ -0,0 +1,28 @@
1
+ from arize.embeddings.base_generators import CVEmbeddingGenerator
2
+ from arize.embeddings.constants import (
3
+ DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL,
4
+ DEFAULT_CV_OBJECT_DETECTION_MODEL,
5
+ )
6
+ from arize.embeddings.usecases import UseCases
7
+
8
+
9
+ class EmbeddingGeneratorForCVImageClassification(CVEmbeddingGenerator):
10
+ def __init__(
11
+ self, model_name: str = DEFAULT_CV_IMAGE_CLASSIFICATION_MODEL, **kwargs
12
+ ):
13
+ super().__init__(
14
+ use_case=UseCases.CV.IMAGE_CLASSIFICATION,
15
+ model_name=model_name,
16
+ **kwargs,
17
+ )
18
+
19
+
20
+ class EmbeddingGeneratorForCVObjectDetection(CVEmbeddingGenerator):
21
+ def __init__(
22
+ self, model_name: str = DEFAULT_CV_OBJECT_DETECTION_MODEL, **kwargs
23
+ ):
24
+ super().__init__(
25
+ use_case=UseCases.CV.OBJECT_DETECTION,
26
+ model_name=model_name,
27
+ **kwargs,
28
+ )
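An illustrative sketch of the image-classification generator above; the image paths are placeholders and the `auto-embeddings` extra is assumed:

```python
import pandas as pd

from arize.embeddings import EmbeddingGenerator, UseCases

generator = EmbeddingGenerator.from_use_case(
    use_case=UseCases.CV.IMAGE_CLASSIFICATION,
    batch_size=32,
)

# generate_embeddings expects a Series of local image paths with a default index.
df = pd.DataFrame({"image_path": ["/data/images/0.png", "/data/images/1.png"]})
df["image_vector"] = generator.generate_embeddings(
    local_image_path_col=df["image_path"]
)
```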
arize/embeddings/errors.py ADDED
@@ -0,0 +1,41 @@
1
+ class InvalidIndexError(Exception):
2
+ def __repr__(self) -> str:
3
+ return "Invalid_Index_Error"
4
+
5
+ def __str__(self) -> str:
6
+ return self.error_message()
7
+
8
+ def __init__(self, field_name: str) -> None:
9
+ self.field_name = field_name
10
+
11
+ def error_message(self) -> str:
12
+ if self.field_name == "DataFrame":
13
+ return (
14
+ f"The index of the {self.field_name} is invalid; "
15
+ f"reset the index by using df.reset_index(drop=True, inplace=True)"
16
+ )
17
+ else:
18
+ return (
19
+ f"The index of the Series given by the column '{self.field_name}' is invalid; "
20
+ f"reset the index by using df.reset_index(drop=True, inplace=True)"
21
+ )
22
+
23
+
24
+ class HuggingFaceRepositoryNotFound(Exception):
25
+ def __repr__(self) -> str:
26
+ return "HuggingFace_Repository_Not_Found_Error"
27
+
28
+ def __str__(self) -> str:
29
+ return self.error_message()
30
+
31
+ def __init__(self, model_name: str) -> None:
32
+ self.model_name = model_name
33
+
34
+ def error_message(self) -> str:
35
+ return (
36
+ f"The given model name '{self.model_name}' is not a valid model identifier listed on "
37
+ "'https://huggingface.co/models'. "
38
+ "If this is a private repository, log in with `huggingface-cli login` or importing "
39
+ "`login` from `huggingface_hub` if you are using a notebook. "
40
+ "Learn more in https://huggingface.co/docs/huggingface_hub/quick-start#login"
41
+ )
arize/embeddings/nlp_generators.py ADDED
@@ -0,0 +1,111 @@
1
+ import logging
2
+ from functools import partial
3
+ from typing import Optional, cast
4
+
5
+ import pandas as pd
6
+
7
+ from arize.embeddings.base_generators import NLPEmbeddingGenerator
8
+ from arize.embeddings.constants import (
9
+ DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL,
10
+ DEFAULT_NLP_SUMMARIZATION_MODEL,
11
+ IMPORT_ERROR_MESSAGE,
12
+ )
13
+ from arize.embeddings.usecases import UseCases
14
+
15
+ try:
16
+ from datasets import Dataset
17
+ except ImportError:
18
+ raise ImportError(IMPORT_ERROR_MESSAGE) from None
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class EmbeddingGeneratorForNLPSequenceClassification(NLPEmbeddingGenerator):
25
+ def __init__(
26
+ self,
27
+ model_name: str = DEFAULT_NLP_SEQUENCE_CLASSIFICATION_MODEL,
28
+ **kwargs,
29
+ ):
30
+ super().__init__(
31
+ use_case=UseCases.NLP.SEQUENCE_CLASSIFICATION,
32
+ model_name=model_name,
33
+ **kwargs,
34
+ )
35
+
36
+ def generate_embeddings(
37
+ self,
38
+ text_col: pd.Series,
39
+ class_label_col: Optional[pd.Series] = None,
40
+ ) -> pd.Series:
41
+ """
42
+ Obtain embedding vectors from your text data using pre-trained large language models.
43
+
44
+ :param text_col: a pandas Series containing the different pieces of text.
45
+ :param class_label_col: if this column is passed, the sentence "The classification label
46
+ is <class_label>" will be appended to the text in the `text_col`.
47
+ :return: a pandas Series containing the embedding vectors.
48
+ """
49
+ if not isinstance(text_col, pd.Series):
50
+ raise TypeError("text_col must be a pandas Series")
51
+
52
+ self.check_invalid_index(field=text_col)
53
+
54
+ if class_label_col is not None:
55
+ if not isinstance(class_label_col, pd.Series):
56
+ raise TypeError("class_label_col must be a pandas Series")
57
+ df = pd.concat(
58
+ {"text": text_col, "class_label": class_label_col}, axis=1
59
+ )
60
+ prepared_text_col = df.apply(
61
+ lambda row: f" The classification label is {row['class_label']}. {row['text']}",
62
+ axis=1,
63
+ )
64
+ ds = Dataset.from_dict({"text": prepared_text_col})
65
+ else:
66
+ ds = Dataset.from_dict({"text": text_col})
67
+
68
+ ds.set_transform(partial(self.tokenize, text_feat_name="text"))
69
+ logger.info("Generating embedding vectors")
70
+ ds = ds.map(
71
+ lambda batch: self._get_embedding_vector(batch, "cls_token"),
72
+ batched=True,
73
+ batch_size=self.batch_size,
74
+ )
75
+ return cast(pd.DataFrame, ds.to_pandas())["embedding_vector"]
76
+
77
+
78
+ class EmbeddingGeneratorForNLPSummarization(NLPEmbeddingGenerator):
79
+ def __init__(
80
+ self, model_name: str = DEFAULT_NLP_SUMMARIZATION_MODEL, **kwargs
81
+ ):
82
+ super().__init__(
83
+ use_case=UseCases.NLP.SUMMARIZATION,
84
+ model_name=model_name,
85
+ **kwargs,
86
+ )
87
+
88
+ def generate_embeddings(
89
+ self,
90
+ text_col: pd.Series,
91
+ ) -> pd.Series:
92
+ """
93
+ Obtain embedding vectors from your text data using pre-trained large language models.
94
+
95
+ :param text_col: a pandas Series containing the different pieces of text.
96
+ :return: a pandas Series containing the embedding vectors.
97
+ """
98
+ if not isinstance(text_col, pd.Series):
99
+ raise TypeError("text_col must be a pandas Series")
100
+ self.check_invalid_index(field=text_col)
101
+
102
+ ds = Dataset.from_dict({"text": text_col})
103
+
104
+ ds.set_transform(partial(self.tokenize, text_feat_name="text"))
105
+ logger.info("Generating embedding vectors")
106
+ ds = ds.map(
107
+ lambda batch: self._get_embedding_vector(batch, "cls_token"),
108
+ batched=True,
109
+ batch_size=self.batch_size,
110
+ )
111
+ return cast(pd.DataFrame, ds.to_pandas())["embedding_vector"]
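A sketch of the sequence-classification generator above, showing the optional `class_label_col` described in its docstring (the label sentence is prepended to each text before tokenization); the sample data is made up:

```python
import pandas as pd

from arize.embeddings.nlp_generators import (
    EmbeddingGeneratorForNLPSequenceClassification,
)

generator = EmbeddingGeneratorForNLPSequenceClassification()

df = pd.DataFrame(
    {
        "text": ["great battery life", "screen cracked on day one"],
        "label": ["positive", "negative"],
    }
)
# Each row is embedded as " The classification label is <label>. <text>".
df["text_vector"] = generator.generate_embeddings(
    text_col=df["text"],
    class_label_col=df["label"],
)
```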
arize/embeddings/tabular_generators.py ADDED
@@ -0,0 +1,161 @@
1
+ import logging
2
+ from functools import partial
3
+ from typing import Dict, List, Optional, Tuple, Union, cast
4
+
5
+ import pandas as pd
6
+
7
+ from arize.embeddings.base_generators import NLPEmbeddingGenerator
8
+ from arize.embeddings.constants import (
9
+ DEFAULT_TABULAR_MODEL,
10
+ IMPORT_ERROR_MESSAGE,
11
+ )
12
+ from arize.embeddings.usecases import UseCases
13
+ from arize.types import is_list_of
14
+
15
+ try:
16
+ from datasets import Dataset
17
+ except ImportError:
18
+ raise ImportError(IMPORT_ERROR_MESSAGE) from None
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ TABULAR_PRETRAINED_MODELS = [
23
+ "bert-base-uncased",
24
+ "distilbert-base-uncased",
25
+ "xlm-roberta-base",
26
+ ]
27
+
28
+
29
+ class EmbeddingGeneratorForTabularFeatures(NLPEmbeddingGenerator):
30
+ def __repr__(self) -> str:
31
+ return (
32
+ f"{self.__class__.__name__}(\n"
33
+ f" use_case={self.use_case},\n"
34
+ f" model_name={self.model_name},\n"
35
+ f" tokenizer_max_length={self.tokenizer_max_length},\n"
36
+ f" tokenizer={self.tokenizer.__class__},\n"
37
+ f" model={self.model.__class__},\n"
38
+ f")"
39
+ )
40
+
41
+ def __init__(
42
+ self,
43
+ model_name: str = DEFAULT_TABULAR_MODEL,
44
+ **kwargs,
45
+ ):
46
+ if model_name not in TABULAR_PRETRAINED_MODELS:
47
+ raise ValueError(
48
+ "model_name not supported. Check supported models with "
49
+ "`EmbeddingGeneratorForTabularFeatures.list_pretrained_models()`"
50
+ )
51
+ super().__init__(
52
+ use_case=UseCases.STRUCTURED.TABULAR_EMBEDDINGS,
53
+ model_name=model_name,
54
+ **kwargs,
55
+ )
56
+
57
+ def generate_embeddings(
58
+ self,
59
+ df: pd.DataFrame,
60
+ selected_columns: List[str],
61
+ col_name_map: Optional[Dict[str, str]] = None,
62
+ return_prompt_col: bool = False,
63
+ ) -> Union[pd.Series, Tuple[pd.Series, pd.Series]]:
64
+ """
65
+ Obtain embedding vectors from your tabular data. Prompts are generated from your
66
+ `selected_columns` and passed to a pre-trained large language model for embedding vector
67
+ computation.
68
+
69
+ :param df: pandas DataFrame containing the tabular data, not all columns will be
70
+ considered, see `selected_columns`.
71
+ :param selected_columns: columns to be considered to construct the prompt to be passed to
72
+ the LLM.
73
+ :param col_name_map: mapping between selected column names and a more verbose description of
74
+ the name. This helps the LLM understand the features better.
75
+ :param return_prompt_col: if set to True, an extra pandas Series will be returned
76
+ containing the constructed prompts. Defaults to False.
77
+ :return: a pandas Series containing the embedding vectors and, if `return_prompt_col` is
78
+ set to True, a pandas Series containing the prompts created from tabular features.
79
+ """
80
+ if col_name_map is None:
81
+ col_name_map = {}
82
+ if not isinstance(df, pd.DataFrame):
83
+ raise TypeError("df must be a pandas DataFrame")
84
+ self.check_invalid_index(field=df)
85
+
86
+ if not is_list_of(selected_columns, str):
87
+ raise TypeError("columns must be a list of column names (strings)")
88
+ missing_cols = set(selected_columns).difference(df.columns)
89
+ if missing_cols:
90
+ raise ValueError(
91
+ "selected_columns list must only contain columns of the dataframe. "
92
+ f"The following columns are not found {missing_cols}"
93
+ )
94
+
95
+ if not isinstance(col_name_map, dict):
96
+ raise TypeError(
97
+ "col_name_map must be a dictionary mapping column names to new column "
98
+ "names"
99
+ )
100
+ for k, v in col_name_map.items():
101
+ if not isinstance(k, str) or not isinstance(v, str):
102
+ raise ValueError(
103
+ "col_name_map dictionary keys and values should be strings"
104
+ )
105
+ missing_cols = set(col_name_map.keys()).difference(df.columns)
106
+ if missing_cols:
107
+ raise ValueError(
108
+ "col_name_map must only contain keys which are columns of the dataframe. "
109
+ f"The following columns are not found {missing_cols}"
110
+ )
111
+
112
+ prompts = df.rename(columns=col_name_map).apply(
113
+ partial(
114
+ self.__prompt_fn,
115
+ columns=[
116
+ col_name_map.get(col, col) for col in selected_columns
117
+ ],
118
+ ),
119
+ axis=1,
120
+ )
121
+ ds = Dataset.from_dict({"prompt": prompts})
122
+ ds.set_transform(partial(self.tokenize, text_feat_name="prompt"))
123
+ logger.info("Generating embedding vectors")
124
+ ds = ds.map(
125
+ lambda batch: self._get_embedding_vector(
126
+ batch, self.__get_method_for_embedding_calculation()
127
+ ),
128
+ batched=True,
129
+ batch_size=self.batch_size,
130
+ )
131
+
132
+ if return_prompt_col:
133
+ return (
134
+ cast(pd.DataFrame, ds.to_pandas())["embedding_vector"],
135
+ cast(pd.Series, prompts),
136
+ )
137
+
138
+ return cast(pd.DataFrame, ds.to_pandas())["embedding_vector"]
139
+
140
+ @staticmethod
141
+ def __prompt_fn(row: pd.DataFrame, columns: List[str]) -> str:
142
+ return " ".join(
143
+ f"The {col.replace('_', ' ')} is {str(row[col]).strip()}."
144
+ for col in columns
145
+ )
146
+
147
+ def __get_method_for_embedding_calculation(self):
148
+ try:
149
+ return {
150
+ "bert-base-uncased": "avg_token",
151
+ "distilbert-base-uncased": "avg_token",
152
+ "xlm-roberta-base": "cls_token",
153
+ }[self.model_name]
154
+ except Exception as exc:
155
+ raise ValueError(
156
+ f"Unsupported model_name {self.model_name}"
157
+ ) from exc
158
+
159
+ @staticmethod
160
+ def list_pretrained_models() -> pd.DataFrame:
161
+ return pd.DataFrame({"Model Name": sorted(TABULAR_PRETRAINED_MODELS)})
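A sketch of the tabular generator above; the column names and `col_name_map` values are illustrative and only show how the prompt is assembled from `selected_columns`:

```python
import pandas as pd

from arize.embeddings import EmbeddingGenerator, UseCases

generator = EmbeddingGenerator.from_use_case(
    use_case=UseCases.STRUCTURED.TABULAR_EMBEDDINGS,
    model_name="distilbert-base-uncased",
)

df = pd.DataFrame({"loan_amount": [1200, 300], "term_months": [36, 12]})
vectors, prompts = generator.generate_embeddings(
    df=df,
    selected_columns=["loan_amount", "term_months"],
    col_name_map={"term_months": "loan term in months"},
    return_prompt_col=True,
)
# prompts[0] == "The loan amount is 1200. The loan term in months is 36."
```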
arize/embeddings/usecases.py ADDED
@@ -0,0 +1,26 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum, auto, unique
3
+
4
+
5
+ @unique
6
+ class NLPUseCases(Enum):
7
+ SEQUENCE_CLASSIFICATION = auto()
8
+ SUMMARIZATION = auto()
9
+
10
+
11
+ @unique
12
+ class CVUseCases(Enum):
13
+ IMAGE_CLASSIFICATION = auto()
14
+ OBJECT_DETECTION = auto()
15
+
16
+
17
+ @unique
18
+ class TabularUsecases(Enum):
19
+ TABULAR_EMBEDDINGS = auto()
20
+
21
+
22
+ @dataclass
23
+ class UseCases:
24
+ NLP = NLPUseCases
25
+ CV = CVUseCases
26
+ STRUCTURED = TabularUsecases
arize/utils/online_tasks/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from arize.utils.online_tasks.dataframe_preprocessor import (
2
+ extract_nested_data_to_column,
3
+ )
4
+
5
+ __all__ = ["extract_nested_data_to_column"]
arize/utils/online_tasks/dataframe_preprocessor.py ADDED
@@ -0,0 +1,235 @@
1
+ import json
2
+ import logging
3
+ from typing import Any, List, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ def extract_nested_data_to_column(
12
+ attributes: List[str], df: pd.DataFrame
13
+ ) -> pd.DataFrame:
14
+ """
15
+ This function, used in Online Tasks, is typically run on data exported from Arize.
16
+ It prepares the DataFrame by extracting relevant attributes from complex, deeply nested
17
+ data structures, such as those found in LLM outputs or JSON-like records. It helps extract
18
+ specific values from these nested structures by identifying the longest matching column name
19
+ in the DataFrame and recursively accessing the desired attribute path within each row.
20
+ This preprocessing step ensures that the extracted values are available as new columns,
21
+ allowing evaluators to process and assess these values effectively.
22
+
23
+ For each attribute string in `attributes` (e.g. "attributes.llm.output_messages.0.message.content"),
24
+ 1) Find the largest prefix that is actually a column name in `df`. (e.g. "attributes.llm.output_messages")
25
+ 2) Use the remainder of the attribute as the introspect path for the values in that column:
26
+ - Calls _introspect_arize_attribute({row_value}, {attribute_remainder}) for each row value
27
+ e.g. {row_value} = [{'message.role': 'assistant',
28
+ 'message.content': 'The capital of China is Beijing.'}]
29
+ e.g. {attribute_remainder} = "0.message.content"
30
+ - This introspect function recursively indexes into a given row_value based on
31
+ the attribute_remainder path and is able to handle a variety of nested structures
32
+ such as the example given for {row_value}
33
+ 3) Create a new column named exactly `attribute`, filling it row-by-row with the result
34
+ of introspecting into the column's value. (e.g. row extracted: 'The capital of China is Beijing.')
35
+ If introspect fails or yields None, store NaN.
36
+ 4) After all columns have been created, drop rows that have NaN in *any* of the newly-created columns.
37
+ 5) Log how many rows were dropped and, if zero rows remain, log a message indicating that
38
+ there are no rows satisfying *all* of the queries.
39
+ """
40
+
41
+ # Make a copy so as not to alter the input df
42
+ result_df = df.copy()
43
+
44
+ # Keep track of which new columns we add. Each column name will match each user-inputted attribute
45
+ # (e.g. "attributes.llm.output_messages.0.message.content")
46
+ new_cols: List[str] = []
47
+
48
+ for attribute in attributes:
49
+ parts = attribute.split(".")
50
+ prefix_col = None
51
+ prefix_len = 0
52
+
53
+ # 1) Find largest prefix of attribute that matches a column in df
54
+ for i in range(1, len(parts) + 1):
55
+ candidate = ".".join(parts[:i])
56
+ if candidate in result_df.columns:
57
+ prefix_col = candidate
58
+ prefix_len = i
59
+
60
+ if prefix_col is None:
61
+ raise Exception("No such column found in DataFrame.")
62
+
63
+ # 2) The remainder after the prefix
64
+ remainder = ".".join(parts[prefix_len:])
65
+
66
+ # 3) Apply introspect row-by-row
67
+ def apply_introspect_arize_attribute(
68
+ row: pd.Series,
69
+ prefix_col: str = prefix_col,
70
+ remainder: str = remainder,
71
+ ) -> Any:
72
+ val = row[prefix_col]
73
+ try:
74
+ result = _introspect_arize_attribute(val, remainder)
75
+ return result if result is not None else np.nan
76
+ except Exception:
77
+ return np.nan
78
+
79
+ result_df[attribute] = result_df.apply(
80
+ apply_introspect_arize_attribute, axis=1
81
+ )
82
+
83
+ new_cols.append(attribute)
84
+
85
+ # 4) Drop rows that are NaN in *any* of the newly-added columns
86
+ rows_before = len(df)
87
+ result_df = result_df.dropna(subset=new_cols)
88
+ rows_after = len(result_df)
89
+ rows_dropped = rows_before - rows_after
90
+
91
+ # 5) Log some diagnostics
92
+ logger.info(f"Rows before processing: {rows_before}")
93
+ logger.info(f"Rows after processing: {rows_after}")
94
+ logger.info(f"Rows dropped: {rows_dropped}")
95
+
96
+ if rows_after == 0:
97
+ logger.info(
98
+ f"For the given filter, there are no rows that have ALL of the following variables: {attributes}"
99
+ )
100
+
101
+ return result_df
102
+
103
+
104
+ def _introspect_arize_attribute(value: Any, attribute: str) -> Any:
105
+ """
106
+ Recursively drill into `value` following the dot-delimited `attribute`.
107
+ Example:
108
+ value: [{'message.role': 'assistant', 'message.content': 'The capital of China is Beijing.'}]
109
+ attribute: "0.message.content"
110
+ Returns: 'The capital of China is Beijing.'
111
+
112
+ - Returns None immediately when a key or index is not found
113
+ - Handles integer parts for lists
114
+ - Parses JSON strings
115
+ - Converts NumPy arrays to lists
116
+ - Allows dotted keys (e.g. "message.content") by combining parts
117
+
118
+ """
119
+ if not attribute:
120
+ return value
121
+
122
+ attribute_parts = attribute.split(".")
123
+ return _introspect_arize_attribute_parts(value, attribute_parts)
124
+
125
+
126
+ def _introspect_arize_attribute_parts(
127
+ current_value: Any, attribute_parts_unprocessed: List[str]
128
+ ) -> Any:
129
+ # If no more parts, we return whatever we have
130
+ if not attribute_parts_unprocessed:
131
+ return current_value
132
+
133
+ current_value = _ensure_deserialized(current_value)
134
+
135
+ # Parse out the next value using the first (or combined) part(s).
136
+ parsed_value, num_parts_processed = _parse_value(
137
+ current_value, attribute_parts_unprocessed
138
+ )
139
+
140
+ # If we can't find a match, immediately return None
141
+ if parsed_value is None:
142
+ return None
143
+
144
+ # Otherwise, recurse deeper with the leftover parts
145
+ return _introspect_arize_attribute_parts(
146
+ parsed_value, attribute_parts_unprocessed[num_parts_processed:]
147
+ )
148
+
149
+
150
+ def _parse_value(
151
+ current_value: Any, attribute_parts_unprocessed: List[str]
152
+ ) -> Tuple[Any, int]:
153
+ """
154
+ Attempt to parse out the next value from `current_value` using the earliest parts:
155
+
156
+ 1) If `attribute_parts_unprocessed[0]` is an integer index and `current_value` is a list/tuple,
157
+ index into it.
158
+ 2) Else if `current_value` is a dict, check if `attribute_parts_unprocessed[0]` is a key.
159
+ If not found, try combining `attribute_parts_unprocessed[0] + '.' + attribute_parts_unprocessed[1]`...
160
+ to handle dotted keys in the dict.
161
+ 3) If none match, return (None, 1) to signal "not found, consume 1 part."
162
+
163
+ Returns (parsed_value, num_parts_processed):
164
+ - parsed_value: the found value or None if not found
165
+ - num_parts_processed: how many parts were processed (1 or more)
166
+ """
167
+
168
+ if not attribute_parts_unprocessed:
169
+ return (None, 0)
170
+
171
+ key = attribute_parts_unprocessed[
172
+ 0
173
+ ] # If key is an int, then it represents a list index
174
+ num_parts_processed = (
175
+ 1 # By default, we're at least consuming this first part
176
+ )
177
+
178
+ # 1) Try integer index (e.g. "0" => 0)
179
+ idx = _try_int(key)
180
+ if idx is not None:
181
+ # Must be a tuple or list (_ensure_deserialized() already casts numpy arrays to python lists)
182
+ if isinstance(current_value, (list, tuple)):
183
+ if 0 <= idx < len(current_value):
184
+ return (current_value[idx], num_parts_processed)
185
+ else:
186
+ return (None, num_parts_processed)
187
+ else:
188
+ return (None, num_parts_processed)
189
+
190
+ # 2) Try dict approach
191
+ if isinstance(current_value, dict):
192
+ # a) direct match
193
+ if key in current_value:
194
+ return (current_value[key], num_parts_processed)
195
+ else:
196
+ # b) try combining multiple parts to handle dotted key
197
+ for num_parts_processed in range(
198
+ 1, len(attribute_parts_unprocessed)
199
+ ):
200
+ key += "." + attribute_parts_unprocessed[num_parts_processed]
201
+ if key in current_value:
202
+ return (
203
+ current_value[key],
204
+ num_parts_processed + 1,
205
+ )
206
+ return (None, num_parts_processed)
207
+
208
+ # If we get here, we couldn't handle it (not a list or dict or mismatch)
209
+ return (None, num_parts_processed)
210
+
211
+
212
+ def _ensure_deserialized(val: Any) -> Any:
213
+ """
214
+ 1) If `val` is a numpy array, convert to a Python list.
215
+ 2) If `val` is a string, attempt to parse as JSON.
216
+ 3) Otherwise return as-is.
217
+ """
218
+ if isinstance(val, np.ndarray):
219
+ val = val.tolist()
220
+
221
+ if isinstance(val, str):
222
+ try:
223
+ return json.loads(val)
224
+ except (json.JSONDecodeError, TypeError, ValueError):
225
+ pass
226
+
227
+ return val
228
+
229
+
230
+ def _try_int(s: str) -> int | None:
231
+ """Attempt to convert s to int, return None on failure."""
232
+ try:
233
+ return int(s)
234
+ except ValueError:
235
+ return None
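A small sketch of `extract_nested_data_to_column` on the kind of exported span data its docstring describes; the column and attribute names mirror the docstring example:

```python
import pandas as pd

from arize.utils.online_tasks import extract_nested_data_to_column

df = pd.DataFrame(
    {
        "attributes.llm.output_messages": [
            [
                {
                    "message.role": "assistant",
                    "message.content": "The capital of China is Beijing.",
                }
            ],
            None,  # rows where the path cannot be resolved are dropped
        ]
    }
)

out = extract_nested_data_to_column(
    attributes=["attributes.llm.output_messages.0.message.content"],
    df=df,
)
print(out["attributes.llm.output_messages.0.message.content"].iloc[0])
# -> The capital of China is Beijing.
```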
arize/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "8.0.0a10"
1
+ __version__ = "8.0.0a11"
{arize-8.0.0a10.dist-info → arize-8.0.0a11.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arize
3
- Version: 8.0.0a10
3
+ Version: 8.0.0a11
4
4
  Summary: A helper library to interact with Arize AI APIs
5
5
  Project-URL: Homepage, https://arize.com
6
6
  Project-URL: Documentation, https://docs.arize.com/arize
@@ -27,6 +27,13 @@ Classifier: Topic :: System :: Monitoring
27
27
  Requires-Python: >=3.10
28
28
  Requires-Dist: lazy-imports
29
29
  Requires-Dist: numpy>=2.0.0
30
+ Provides-Extra: auto-embeddings
31
+ Requires-Dist: datasets!=2.14.*,<3,>=2.8; extra == 'auto-embeddings'
32
+ Requires-Dist: pandas<3,>=1.0.0; extra == 'auto-embeddings'
33
+ Requires-Dist: pillow<11,>=8.4.0; extra == 'auto-embeddings'
34
+ Requires-Dist: tokenizers<1,>=0.13; extra == 'auto-embeddings'
35
+ Requires-Dist: torch<3,>=1.13; extra == 'auto-embeddings'
36
+ Requires-Dist: transformers<5,>=4.25; extra == 'auto-embeddings'
30
37
  Provides-Extra: dev
31
38
  Requires-Dist: pytest==8.4.2; extra == 'dev'
32
39
  Requires-Dist: ruff==0.13.2; extra == 'dev'
@@ -84,6 +91,7 @@ Description-Content-Type: text/markdown
84
91
  - [Stream log ML Data for a Classification use-case](#stream-log-ml-data-for-a-classification-use-case)
85
92
  - [Log a batch of ML Data for a Object Detection use-case](#log-a-batch-of-ml-data-for-a-object-detection-use-case)
86
93
  - [Exporting ML Data](#exporting-ml-data)
94
+ - [Generate embeddings for your data](#generate-embeddings-for-your-data)
87
95
  - [Community](#community)
88
96
 
89
97
  # Overview
@@ -326,6 +334,38 @@ df = client.models.export_to_df(
326
334
  )
327
335
  ```
328
336
 
337
+ ## Generate embeddings for your data
338
+
339
+ ```python
340
+ import pandas as pd
341
+ from arize.embeddings import EmbeddingGenerator, UseCases
342
+
343
+ # You can check available models
344
+ print(EmbeddingGenerator.list_pretrained_models())
345
+
346
+ # Example dataframe
347
+ df = pd.DataFrame(
348
+ {
349
+ "text": [
350
+ "Hello world.",
351
+ "Artificial Intelligence is the future.",
352
+ "Spain won the FIFA World Cup on 2010.",
353
+ ],
354
+ }
355
+ )
356
+ # Instantiate the generator for your usecase, selecting the base model
357
+ generator = EmbeddingGenerator.from_use_case(
358
+ use_case=UseCases.NLP.SEQUENCE_CLASSIFICATION,
359
+ model_name="distilbert-base-uncased",
360
+ tokenizer_max_length=512,
361
+ batch_size=100,
362
+ )
363
+
364
+ # Generate embeddings
365
+ df["text_vector"] = generator.generate_embeddings(text_col=df["text"])
366
+ ```
367
+
368
+
329
369
  # Community
330
370
 
331
371
  Join our community to connect with thousands of AI builders.
{arize-8.0.0a10.dist-info → arize-8.0.0a11.dist-info}/RECORD RENAMED
@@ -1,10 +1,10 @@
1
1
  arize/__init__.py,sha256=-4bbbZwcjGS9OfAunsB-lmKRCzccPdFvZmvJQJEky3E,534
2
2
  arize/_lazy.py,sha256=MVep6D93sJWvArg4pgm4CVNGc6tu-XRK_Z7EDMuc76I,2358
3
- arize/client.py,sha256=0LtZU3WeEatGd1QgQsMrJOuI-tFmzM3y1AfO74BLJys,5716
3
+ arize/client.py,sha256=kDdOWC1rwYgPPExO3wT3-KU3qpMwQ0ogrAdjvf7Ls3M,5860
4
4
  arize/config.py,sha256=iynVEZhrOPdTNJTQ_KQmwKOPiwL0LfEP8AUIDYW86Xw,5801
5
5
  arize/logging.py,sha256=2vwdta2-kR78GeBFGK2vpk51rQ2d06HoKzuARI9qFQk,7317
6
6
  arize/types.py,sha256=z1yg5-brmTD4kVHDmmTVkYke53JpusXXeOOpdQw7rYg,69508
7
- arize/version.py,sha256=Wv8B6KxzS2ThGtkzs_13OkvwSugf5HITHYMQsGk1gjg,25
7
+ arize/version.py,sha256=YFPzyK5jfODAbvUqUQHeQ5WVmHl6zTh9HSFOA75S0rc,25
8
8
  arize/_exporter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  arize/_exporter/client.py,sha256=eAxJX1sUfdpLrtaQ0ynMTd5jI37JOp9fbl3NWp4WFEA,15216
10
10
  arize/_exporter/validation.py,sha256=6ROu5p7uaolxQ93lO_Eiwv9NVw_uyi3E5T--C5Klo5Q,1021
@@ -59,6 +59,15 @@ arize/constants/model_mapping.json,sha256=OPE54rBATzmwRhx0tycsxnGae1jBhtqEmQqQvz
59
59
  arize/constants/spans.py,sha256=EfMgbEIK_2EUcvUY5BGnNAbS7bupBKePlI3j2L5T5CE,2532
60
60
  arize/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
61
  arize/datasets/client.py,sha256=Rk3TQF2IzJwi3JqF1GYt1tUs68gPIiVFRgKjEmY7igE,743
62
+ arize/embeddings/__init__.py,sha256=6_C8908W_qDixkoBJl1wapgmQCzI8TPLH207kzbYsFA,156
63
+ arize/embeddings/auto_generator.py,sha256=ukZUJWRkiG9HFgSHXhr44rt2tdVHn1phb7_nOxYXWEg,4111
64
+ arize/embeddings/base_generators.py,sha256=HybEUAzeESswEDmkmvPayzFab1y8deg5X20HSphGp8Q,8855
65
+ arize/embeddings/constants.py,sha256=77LEXcXr_MPGRVSE06-4opFGeYrtdMmosQX91yQu6p0,1104
66
+ arize/embeddings/cv_generators.py,sha256=8eXwvP_kvAt8I9WA-0tRJd0XID4lFOydyTYfOMW_-xo,880
67
+ arize/embeddings/errors.py,sha256=T8PTFELs-xs7GXDmx402T_-DCkCXkV1CxdKAc2jAM2s,1517
68
+ arize/embeddings/nlp_generators.py,sha256=AVUpr95nQChVGAUiruCoME8tcrh79PaRrbKI7H1gGBE,3843
69
+ arize/embeddings/tabular_generators.py,sha256=lj2wVmJTfqjrziDI6Z-EEQzdwSZOml2G8PN1O4Zo5SA,5970
70
+ arize/embeddings/usecases.py,sha256=czoj5xk_WyYBsc9LE79JtkMbMN4RfKilwlm8pxl3Q_8,442
62
71
  arize/exceptions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
72
  arize/exceptions/auth.py,sha256=5hy7hhvgRCnqPACBTfG_0OICmcd9OlHQLHGhLysA6mA,403
64
73
  arize/exceptions/base.py,sha256=TWdtMulMi1Cg6X8nne_nlg8DY0zmLHb-hW9AbvjMGOs,3261
@@ -109,7 +118,9 @@ arize/utils/arrow.py,sha256=4In1gQc0i4Rb8zuwI0w-Hv-10wiItu5opqqGrJ8tSzo,5277
109
118
  arize/utils/casting.py,sha256=KUrPUQN6qJEVe39nxbr0T-0GjAJLHjf4xWuzV71QezI,12468
110
119
  arize/utils/dataframe.py,sha256=I0FloPgNiqlKga32tMOvTE70598QA8Hhrgf-6zjYMAM,1120
111
120
  arize/utils/proto.py,sha256=9vLo53INYjdF78ffjm3E48jFwK6LbPD2FfKei7VaDy8,35477
112
- arize-8.0.0a10.dist-info/METADATA,sha256=9u9UPm9jOeZp9pxLo9R5mDYvrACrOzbPET51mNyyXQU,12567
113
- arize-8.0.0a10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
114
- arize-8.0.0a10.dist-info/licenses/LICENSE.md,sha256=8vLN8Gms62NCBorxIv9MUvuK7myueb6_-dhXHPmm4H0,1479
115
- arize-8.0.0a10.dist-info/RECORD,,
121
+ arize/utils/online_tasks/__init__.py,sha256=nDuTLUTYnZaWgyJoYR1P7O8ZKA-Nba7X6tJ9OislbWM,144
122
+ arize/utils/online_tasks/dataframe_preprocessor.py,sha256=YyeeeFu_FwCYImbYvBZvQIH_5TK2lHru8KSfqV893ps,8884
123
+ arize-8.0.0a11.dist-info/METADATA,sha256=8VQP8JDh48Lbj07BqrntHaJjRdlalM7a5Zq3pv5s7E0,13842
124
+ arize-8.0.0a11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
125
+ arize-8.0.0a11.dist-info/licenses/LICENSE.md,sha256=8vLN8Gms62NCBorxIv9MUvuK7myueb6_-dhXHPmm4H0,1479
126
+ arize-8.0.0a11.dist-info/RECORD,,