validmind 2.1.1__py3-none-any.whl → 2.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +72 -49
  3. validmind/api_client.py +42 -16
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/errors.py +1 -1
  13. validmind/html_templates/__init__.py +0 -0
  14. validmind/html_templates/content_blocks.py +89 -14
  15. validmind/models/__init__.py +7 -4
  16. validmind/models/foundation.py +8 -34
  17. validmind/models/function.py +51 -0
  18. validmind/models/huggingface.py +16 -46
  19. validmind/models/metadata.py +42 -0
  20. validmind/models/pipeline.py +66 -0
  21. validmind/models/pytorch.py +8 -42
  22. validmind/models/r_model.py +33 -82
  23. validmind/models/sklearn.py +39 -38
  24. validmind/template.py +8 -26
  25. validmind/tests/__init__.py +43 -20
  26. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  27. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  28. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  29. validmind/tests/data_validation/Duplicates.py +1 -1
  30. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  31. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  32. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  33. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  34. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  35. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  36. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  37. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  38. validmind/tests/decorator.py +12 -7
  39. validmind/tests/model_validation/BertScore.py +100 -98
  40. validmind/tests/model_validation/BleuScore.py +93 -64
  41. validmind/tests/model_validation/ContextualRecall.py +74 -91
  42. validmind/tests/model_validation/MeteorScore.py +86 -74
  43. validmind/tests/model_validation/RegardScore.py +103 -121
  44. validmind/tests/model_validation/RougeScore.py +118 -0
  45. validmind/tests/model_validation/TokenDisparity.py +84 -121
  46. validmind/tests/model_validation/ToxicityScore.py +109 -123
  47. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  48. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  50. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  51. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  52. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  56. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  57. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  58. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  59. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  60. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  61. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  62. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  63. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  65. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  66. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  67. validmind/tests/model_validation/ragas/utils.py +66 -0
  68. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  69. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  70. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  71. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  72. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  73. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  74. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  75. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
  76. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  83. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  84. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  85. validmind/unit_metrics/__init__.py +26 -49
  86. validmind/unit_metrics/composite.py +13 -7
  87. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  88. validmind/utils.py +99 -6
  89. validmind/vm_models/__init__.py +1 -1
  90. validmind/vm_models/dataset/__init__.py +7 -0
  91. validmind/vm_models/dataset/dataset.py +560 -0
  92. validmind/vm_models/dataset/utils.py +146 -0
  93. validmind/vm_models/model.py +97 -72
  94. validmind/vm_models/test/metric.py +9 -24
  95. validmind/vm_models/test/result_wrapper.py +124 -28
  96. validmind/vm_models/test/threshold_test.py +10 -28
  97. validmind/vm_models/test_context.py +1 -1
  98. validmind/vm_models/test_suite/summary.py +3 -4
  99. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/METADATA +5 -3
  100. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/RECORD +103 -78
  101. validmind/models/catboost.py +0 -33
  102. validmind/models/statsmodels.py +0 -50
  103. validmind/models/xgboost.py +0 -30
  104. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  105. validmind/tests/model_validation/RegardHistogram.py +0 -148
  106. validmind/tests/model_validation/RougeMetrics.py +0 -147
  107. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  108. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  109. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  110. validmind/vm_models/dataset.py +0 -1303
  111. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/LICENSE +0 -0
  112. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/WHEEL +0 -0
  113. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/entry_points.txt +0 -0
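Of the 113 files, the diff below reproduces in full the two modules that replace the removed monolithic validmind/vm_models/dataset.py (items 91, 92, and 110 above): first validmind/vm_models/dataset/dataset.py (+560 lines), then validmind/vm_models/dataset/utils.py (+146 lines). For orientation, here is a minimal usage sketch of the refactored DataFrameDataset class defined in the first hunk; the import path simply mirrors the new package layout, and the three-row frame is illustrative only:

import pandas as pd

from validmind.vm_models.dataset.dataset import DataFrameDataset

df = pd.DataFrame(
    {
        "age": [25, 32, 47],
        "income": [40_000, 55_000, 82_000],
        "default": [0, 0, 1],
    }
)

# The target column is automatically excluded from feature_columns
# (see _set_feature_columns in the hunk below).
dataset = DataFrameDataset(
    raw_dataset=df,
    input_id="training_data",
    target_column="default",
)

print(dataset.feature_columns)           # ['age', 'income']
print(dataset.x.shape, dataset.y.shape)  # (3, 2) (3,)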
@@ -0,0 +1,560 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ """
+ Dataset class wrapper
+ """
+
+ import warnings
+
+ import numpy as np
+ import pandas as pd
+ import polars as pl
+
+ from validmind.logging import get_logger
+ from validmind.models import FunctionModel, PipelineModel
+ from validmind.vm_models.model import VMModel
+
+ from .utils import ExtraColumns, as_df, compute_predictions, convert_index_to_datetime
+
+ logger = get_logger(__name__)
+
+
+ class VMDataset:
+     """Base class for VM datasets.
+
+     Child classes should be used to support new dataset types (tensor, polars, etc.)
+     by converting the user's dataset into a NumPy array, collecting metadata such as
+     column names, and then calling this (parent) class's `__init__` method.
+
+     This way we can support multiple dataset types, but under the hood we only
+     need to work with NumPy arrays and pandas DataFrames in this class.
+
+     Attributes:
+         raw_dataset (np.ndarray): The raw dataset as a NumPy array.
+         input_id (str): Identifier for the dataset.
+         index (np.ndarray): The raw dataset index as a NumPy array.
+         columns (List[str]): The column names of the dataset.
+         target_column (str): The target column name of the dataset.
+         feature_columns (List[str]): The feature column names of the dataset.
+         feature_columns_numeric (List[str]): The numeric feature column names of the dataset.
+         feature_columns_categorical (List[str]): The categorical feature column names of the dataset.
+         text_column (str): The text column name of the dataset for NLP tasks.
+         target_class_labels (Dict): The class labels for the target columns.
+         df (pd.DataFrame): The dataset as a pandas DataFrame.
+         extra_columns (ExtraColumns): Extra (non-feature) columns tracked for the dataset.
+     """
+
+     def __init__(
+         self,
+         raw_dataset: np.ndarray,
+         input_id: str = None,
+         model: VMModel = None,
+         index: np.ndarray = None,
+         index_name: str = None,
+         date_time_index: bool = False,
+         columns: list = None,
+         target_column: str = None,
+         feature_columns: list = None,
+         text_column: str = None,
+         extra_columns: dict = None,
+         target_class_labels: dict = None,
+         options: dict = None,
+     ):
+         """
+         Initializes a VMDataset instance.
+
+         Args:
+             raw_dataset (np.ndarray): The raw dataset as a NumPy array.
+             input_id (str): Identifier for the dataset.
+             model (VMModel): Model associated with the dataset.
+             index (np.ndarray): The raw dataset index as a NumPy array.
+             index_name (str): The name of the index column.
+             date_time_index (bool): Whether the index is a datetime index.
+             columns (List[str], optional): The column names of the dataset. Defaults to None.
+             target_column (str, optional): The target column name of the dataset. Defaults to None.
+             feature_columns (List[str], optional): The feature column names of the dataset. Defaults to None.
+             text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
+             extra_columns (Dict, optional): Extra columns to include in the dataset. Defaults to None.
+             target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
+             options (Dict, optional): Additional options for the dataset. Defaults to None.
+         """
+         # initialize input_id
+         self.input_id = input_id
+
+         # initialize raw dataset
+         if not isinstance(raw_dataset, np.ndarray):
+             raise ValueError("Expected Numpy array for attribute raw_dataset")
+         self._raw_dataset = raw_dataset
+
+         # initialize index and index name
+         if index is not None and not isinstance(index, np.ndarray):
+             raise ValueError("Expected Numpy array for attribute index")
+         self.index = index
+
+         self.df = pd.DataFrame(self._raw_dataset, columns=columns).infer_objects()
+         # set the index on the dataframe
+         if index is not None:
+             self.df.set_index(pd.Index(index), inplace=True)
+             self.df.index.name = index_name
+         # attempt to convert the index to datetime
+         if date_time_index:
+             self.df = convert_index_to_datetime(self.df)
+
+         self.options = options
+
+         self.columns = columns or []
+         self.column_aliases = {}
+         self.target_column = target_column
+         self.text_column = text_column
+         self.target_class_labels = target_class_labels
+         self.extra_columns = ExtraColumns.from_dict(extra_columns)
+         self._set_feature_columns(feature_columns)
+
+         if model:
+             self.assign_predictions(model)
+
+     def _set_feature_columns(self, feature_columns=None):
+         if feature_columns is not None and (
+             not isinstance(feature_columns, list)
+             or not all(isinstance(col, str) for col in feature_columns)
+         ):
+             raise ValueError("Expected list of column names for `feature_columns`")
+
+         if feature_columns:
+             self.feature_columns = feature_columns
+         else:
+             excluded = [self.target_column, *self.extra_columns.flatten()]
+             self.feature_columns = [col for col in self.columns if col not in excluded]
+
+         self.feature_columns_numeric = (
+             self.df[self.feature_columns]
+             .select_dtypes(include=[np.number])
+             .columns.tolist()
+         )
+         self.feature_columns_categorical = (
+             self.df[self.feature_columns]
+             .select_dtypes(include=[object, pd.Categorical])
+             .columns.tolist()
+         )
+
+     def _add_column(self, column_name, column_values):
+         if len(column_values) != len(self.df):
+             raise ValueError(
+                 "Length of values doesn't match number of rows in the DataFrame."
+             )
+
+         self.columns.append(column_name)
+         self.df[column_name] = column_values
+
+     def _validate_assign_predictions(
+         self,
+         model: VMModel,
+         prediction_column: str,
+         prediction_values: list,
+         probability_column: str,
+         probability_values: list,
+     ):
+         if not isinstance(model, VMModel):
+             raise ValueError("Expected VMModel instance for argument `model`")
+
+         if prediction_column and prediction_values is not None:
+             raise ValueError(
+                 "Only one of the following arguments can be provided: "
+                 "`prediction_column`, `prediction_values`"
+             )
+
+         if probability_column and probability_values is not None:
+             raise ValueError(
+                 "Only one of the following arguments can be provided: "
+                 "`probability_column`, `probability_values`"
+             )
+
+         if prediction_column and prediction_column not in self.columns:
+             raise ValueError(
+                 f"Prediction column {prediction_column} doesn't exist in the dataset"
+             )
+
+         if probability_column and probability_column not in self.columns:
+             raise ValueError(
+                 f"Probability column {probability_column} doesn't exist in the dataset"
+             )
+
+         if (probability_column or probability_values is not None) and (
+             not prediction_column and prediction_values is None
+         ):
+             raise ValueError(
+                 "Cannot use precomputed probabilities without precomputed predictions"
+             )
+
+     def assign_predictions(
+         self,
+         model: VMModel,
+         prediction_column: str = None,
+         prediction_values: list = None,
+         probability_column: str = None,
+         probability_values: list = None,
+         prediction_probabilities: list = None,  # DEPRECATED: use probability_values
+     ):
+         if prediction_probabilities is not None:
+             warnings.warn(
+                 "The `prediction_probabilities` argument is deprecated. Use `probability_values` instead.",
+                 DeprecationWarning,
+             )
+             probability_values = prediction_probabilities
+
+         self._validate_assign_predictions(
+             model,
+             prediction_column,
+             prediction_values,
+             probability_column,
+             probability_values,
+         )
+
+         if self.prediction_column(model):
+             logger.warning("Model predictions already assigned... Overwriting.")
+
+         if self.probability_column(model):
+             logger.warning("Model probabilities already assigned... Overwriting.")
+
+         # if the user passes a column name, we assume it holds precomputed predictions
+         if prediction_column:
+             prediction_values = self.df[prediction_column].values
+
+         if probability_column:
+             probability_values = self.df[probability_column].values
+
+         if prediction_values is None:
+             X = self.df if isinstance(model, (FunctionModel, PipelineModel)) else self.x
+             probability_values, prediction_values = compute_predictions(model, X)
+
+         prediction_column = prediction_column or f"{model.input_id}_prediction"
+         self._add_column(prediction_column, prediction_values)
+         self.prediction_column(model, prediction_column)
+
+         if probability_values is not None:
+             probability_column = probability_column or f"{model.input_id}_probabilities"
+             self._add_column(probability_column, probability_values)
+             self.probability_column(model, probability_column)
+         else:
+             logger.info(
+                 "No probabilities computed or provided. "
+                 "Not adding a probability column to the dataset."
+             )
+
+     def prediction_column(self, model: VMModel, column_name: str = None) -> str:
+         """Get or set the prediction column for a model."""
+         if column_name and column_name not in self.columns:
+             raise ValueError(f"{column_name} doesn't exist in the dataset")
+
+         if column_name and column_name in self.feature_columns:
+             self.feature_columns.remove(column_name)
+
+         return self.extra_columns.prediction_column(model, column_name)
+
+     def probability_column(self, model: VMModel, column_name: str = None) -> str:
+         """Get or set the probability column for a model."""
+         if column_name and column_name not in self.columns:
+             raise ValueError(f"{column_name} doesn't exist in the dataset")
+
+         if column_name and column_name in self.feature_columns:
+             self.feature_columns.remove(column_name)
+
+         return self.extra_columns.probability_column(model, column_name)
+
+     def add_extra_column(self, column_name, column_values=None):
+         """Adds an extra column to the dataset without modifying the dataset `features` and `target` columns.
+
+         Args:
+             column_name (str): The name of the extra column.
+             column_values (np.ndarray, optional): The values of the extra column.
+         """
+         if column_name not in self.columns and (
+             column_values is None or len(column_values) == 0
+         ):
+             raise ValueError(
+                 "Column values must be provided when the column doesn't exist in the dataset"
+             )
+
+         # some warnings to let the user know what's happening
+         if column_name in self.extra_columns:
+             logger.warning(f"{column_name} is already an extra column. Overwriting...")
+         elif column_name in self.columns and column_values is not None:
+             logger.warning(
+                 f"{column_name} already exists in the dataset but `column_values` were passed. Overwriting..."
+             )
+
+         self.extra_columns.extras.add(column_name)
+         if column_values is not None:
+             self._add_column(column_name, column_values)
+             logger.info(
+                 f"Extra column {column_name} with {len(column_values)} values added to the dataset"
+             )
+         else:
+             logger.info(f"Existing column {column_name} is now tracked as an extra column")
+
+         # reset feature columns to exclude the new extra column
+         self._set_feature_columns()
+
+     @property
+     def x(self) -> np.ndarray:
+         """
+         Returns the input features (X) of the dataset.
+
+         Returns:
+             np.ndarray: The input features.
+         """
+         return self.df[self.feature_columns].to_numpy()
+
+     @property
+     def y(self) -> np.ndarray:
+         """
+         Returns the target variables (y) of the dataset.
+
+         Returns:
+             np.ndarray: The target variables.
+         """
+         return self.df[self.target_column].to_numpy()
+
+     def y_pred(self, model) -> np.ndarray:
+         """Returns the predictions for a given model.
+
+         Attempts to stack complex prediction types (e.g., embeddings) into a single,
+         multi-dimensional array.
+
+         Args:
+             model (VMModel): The model whose predictions are sought.
+
+         Returns:
+             np.ndarray: The predictions for the model.
+         """
+         return np.stack(self.df[self.prediction_column(model)].values)
+
+     def y_prob(self, model) -> np.ndarray:
+         """Returns the probabilities for a given model.
+
+         Args:
+             model (VMModel): The model whose probabilities are sought.
+
+         Returns:
+             np.ndarray: The probabilities for the model.
+         """
+         return self.df[self.probability_column(model)].values
+
+     def x_df(self):
+         """Returns a dataframe containing only the feature columns"""
+         return as_df(self.df[self.feature_columns])
+
+     def y_df(self) -> pd.DataFrame:
+         """Returns a dataframe containing the target column"""
+         return as_df(self.df[self.target_column])
+
+     def y_pred_df(self, model) -> pd.DataFrame:
+         """Returns a dataframe containing the predictions for a given model"""
+         return as_df(self.df[self.prediction_column(model)])
+
+     def y_prob_df(self, model) -> pd.DataFrame:
+         """Returns a dataframe containing the probabilities for a given model"""
+         return as_df(self.df[self.probability_column(model)])
+
+     def target_classes(self):
+         """Returns the unique target classes of the target (Y) variable as strings"""
+         return [str(i) for i in np.unique(self.y)]
+
+     def __str__(self):
+         return (
+             f"=================\n"
+             f"VMDataset object: \n"
+             f"=================\n"
+             f"Input ID: {self.input_id}\n"
+             f"Target Column: {self.target_column}\n"
+             f"Feature Columns: {self.feature_columns}\n"
+             f"Text Column: {self.text_column}\n"
+             f"Extra Columns: {self.extra_columns}\n"
+             f"Target Class Labels: {self.target_class_labels}\n"
+             f"Columns: {self.columns}\n"
+             f"Index: {self.index}\n"
+             f"=================\n"
+         )
+
+
+ class DataFrameDataset(VMDataset):
+     """
+     VM dataset implementation for pandas DataFrame.
+     """
+
+     def __init__(
+         self,
+         raw_dataset: pd.DataFrame,
+         input_id: str = None,
+         model: VMModel = None,
+         target_column: str = None,
+         extra_columns: dict = None,
+         feature_columns: list = None,
+         text_column: str = None,
+         target_class_labels: dict = None,
+         options: dict = None,
+         date_time_index: bool = False,
+     ):
+         """
+         Initializes a DataFrameDataset instance.
+
+         Args:
+             raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
+             input_id (str, optional): Identifier for the dataset. Defaults to None.
+             model (VMModel, optional): Model associated with the dataset. Defaults to None.
+             target_column (str, optional): The target column of the dataset. Defaults to None.
+             extra_columns (dict, optional): Extra columns to include in the dataset. Defaults to None.
+             feature_columns (list, optional): The feature columns of the dataset. Defaults to None.
+             text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
+             target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
+             options (dict, optional): Additional options for the dataset. Defaults to None.
+             date_time_index (bool, optional): Whether to use a date-time index. Defaults to False.
+         """
+         index = None
+         if isinstance(raw_dataset.index, pd.Index):
+             index = raw_dataset.index.values
+
+         super().__init__(
+             raw_dataset=raw_dataset.values,
+             input_id=input_id,
+             model=model,
+             index_name=raw_dataset.index.name,
+             index=index,
+             columns=raw_dataset.columns.to_list(),
+             target_column=target_column,
+             extra_columns=extra_columns,
+             feature_columns=feature_columns,
+             text_column=text_column,
+             target_class_labels=target_class_labels,
+             options=options,
+             date_time_index=date_time_index,
+         )
+
+
+ class PolarsDataset(VMDataset):
+     """
+     VM dataset implementation for Polars DataFrame.
+     """
+
+     def __init__(
+         self,
+         raw_dataset: pl.DataFrame,
+         input_id: str = None,
+         model: VMModel = None,
+         target_column: str = None,
+         extra_columns: dict = None,
+         feature_columns: list = None,
+         text_column: str = None,
+         target_class_labels: dict = None,
+         options: dict = None,
+         date_time_index: bool = False,
+     ):
+         """
+         Initializes a PolarsDataset instance.
+
+         Args:
+             raw_dataset (pl.DataFrame): The raw dataset as a Polars DataFrame.
+             input_id (str, optional): Identifier for the dataset. Defaults to None.
+             model (VMModel, optional): Model associated with the dataset. Defaults to None.
+             target_column (str, optional): The target column of the dataset. Defaults to None.
+             extra_columns (dict, optional): Extra columns to include in the dataset. Defaults to None.
+             feature_columns (list, optional): The feature columns of the dataset. Defaults to None.
+             text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
+             target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
+             options (dict, optional): Additional options for the dataset. Defaults to None.
+             date_time_index (bool, optional): Whether to use a date-time index. Defaults to False.
+         """
+         super().__init__(
+             raw_dataset=raw_dataset.to_numpy(),
+             input_id=input_id,
+             model=model,
+             index_name=None,
+             index=None,
+             columns=raw_dataset.columns,
+             target_column=target_column,
+             extra_columns=extra_columns,
+             feature_columns=feature_columns,
+             text_column=text_column,
+             target_class_labels=target_class_labels,
+             options=options,
+             date_time_index=date_time_index,
+         )
+
+
+ class TorchDataset(VMDataset):
+     """
+     VM dataset implementation for PyTorch Datasets.
+     """
+
+     def __init__(
+         self,
+         raw_dataset,
+         input_id: str = None,
+         model: VMModel = None,
+         index_name=None,
+         index=None,
+         columns=None,
+         target_column: str = None,
+         extra_columns: dict = None,
+         feature_columns: list = None,
+         text_column: str = None,
+         target_class_labels: dict = None,
+         options: dict = None,
+     ):
+         """
+         Initializes a TorchDataset instance.
+
+         Args:
+             raw_dataset (torch.utils.data.TensorDataset): The raw dataset as a PyTorch TensorDataset.
+             input_id (str, optional): Identifier for the dataset. Defaults to None.
+             model (VMModel, optional): Model associated with the dataset. Defaults to None.
+             index_name (str): The raw dataset index name.
+             index (np.ndarray): The raw dataset index as a NumPy array.
+             columns (List[str]): The column names of the dataset.
+             target_column (str, optional): The target column of the dataset. Defaults to None.
+             extra_columns (dict, optional): Extra columns to include in the dataset. Defaults to None.
+             feature_columns (list, optional): The feature columns of the dataset. Defaults to None.
+             text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
+             target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
+             options (dict, optional): Additional options for the dataset. Defaults to None.
+         """
+         try:
+             import torch
+         except ImportError:
+             raise ImportError(
+                 "PyTorch is not installed, please run `pip install validmind[pytorch]`"
+             )
+
+         columns = []
+
+         for i, tens in enumerate(raw_dataset.tensors):
+             if i == 0 and feature_columns is None:
+                 n_cols = tens.shape[1]
+                 feature_columns = [f"x{feature_id}" for feature_id in range(n_cols)]
+                 columns.extend(feature_columns)
+
+             elif i == 1 and target_column is None:
+                 target_column = "y"
+                 columns.append(target_column)
+
+             elif i == 2 and extra_columns is None:
+                 # treat a third tensor as precomputed predictions and track it as
+                 # an extra column so it is excluded from the feature columns
+                 extra_columns = {"y_pred": None}
+                 columns.append("y_pred")
+
+         merged_tensors = torch.cat(raw_dataset.tensors, dim=1).numpy()
+
+         super().__init__(
+             input_id=input_id,
+             raw_dataset=merged_tensors,
+             model=model,
+             index_name=index_name,
+             index=index,
+             columns=columns,
+             target_column=target_column,
+             feature_columns=feature_columns,
+             text_column=text_column,
+             extra_columns=extra_columns,
+             target_class_labels=target_class_labels,
+             options=options,
+         )
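Continuing the sketch above: predictions attach to a dataset through any VMModel subclass. The FunctionModel referenced below is added in this release (validmind/models/function.py, item 17 above) but its body is not shown in this diff, so the input_id/predict_fn constructor used here is an assumption for illustration only:

from validmind.models import FunctionModel

# Assumed constructor -- function.py's contents are not shown in this diff.
model = FunctionModel(input_id="champion", predict_fn=lambda row: 0)

# Precomputed predictions are stored as "champion_prediction" and registered
# in extra_columns, so they never leak into feature_columns.
dataset.assign_predictions(model, prediction_values=[0, 0, 1])
print(dataset.y_pred(model))  # array([0, 0, 1])

# The deprecated keyword still works but emits a DeprecationWarning and is
# remapped to probability_values:
# dataset.assign_predictions(model, prediction_values=[0, 0, 1],
#                            prediction_probabilities=[0.2, 0.4, 0.9])

The second hunk, below, is the new validmind/vm_models/dataset/utils.py, which hosts the helpers (ExtraColumns, compute_predictions, convert_index_to_datetime) imported at the top of dataset.py.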
@@ -0,0 +1,146 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Set, Union
+
+ import numpy as np
+ import pandas as pd
+
+ from validmind.errors import MissingOrInvalidModelPredictFnError
+ from validmind.logging import get_logger
+
+ logger = get_logger(__name__)
+
+
+ @dataclass
+ class ExtraColumns:
+     """Extra columns for the dataset."""
+
+     extras: Set[str] = field(default_factory=set)
+     group_by_column: str = None
+     prediction_columns: Dict[str, str] = field(default_factory=dict)
+     probability_columns: Dict[str, str] = field(default_factory=dict)
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         if not data:
+             return cls()
+
+         return cls(
+             extras=set(
+                 [
+                     k
+                     for k in data.keys()
+                     if k not in ["group_by", "predictions", "probabilities"]
+                 ]
+             ),
+             group_by_column=data.get("group_by"),
+             prediction_columns=data.get("predictions", {}),
+             probability_columns=data.get("probabilities", {}),
+         )
+
+     def __contains__(self, key):
+         """Allow checking if a key is `in` the extra columns"""
+         return key in self.flatten()
+
+     def flatten(self) -> List[str]:
+         """Get a list of all column names"""
+         return [
+             self.group_by_column,
+             *self.extras,
+             *self.prediction_columns.values(),
+             *self.probability_columns.values(),
+         ]
+
+     def add_extra(self, column_name: str) -> str:
+         self.extras.add(column_name)
+
+         return column_name
+
+     def prediction_column(self, model, column_name: str = None):
+         """Get or set the prediction column for a model."""
+         if column_name:
+             self.prediction_columns[model.input_id] = column_name
+
+         return self.prediction_columns.get(model.input_id)
+
+     def probability_column(self, model, column_name: str = None):
+         """Get or set the probability column for a model."""
+         if column_name:
+             self.probability_columns[model.input_id] = column_name
+
+         return self.probability_columns.get(model.input_id)
+
+
+ def as_df(series_or_frame: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
+     if isinstance(series_or_frame, pd.Series):
+         return series_or_frame.to_frame()
+     return series_or_frame
+
+
+ def _is_probabilities(output):
+     """Check if the output from the predict method looks like probabilities."""
+     if not isinstance(output, np.ndarray) or output.ndim > 1:
+         return False
+
+     # A simple heuristic: assume the output is probabilities if all values lie
+     # in [0, 1] and at least one value is strictly between 0 and 1 (i.e. the
+     # output is not just hard 0/1 labels).
+     return np.all((output >= 0) & (output <= 1)) and np.any((output > 0) & (output < 1))
+
+
+ def compute_predictions(model, X) -> tuple:
+     probability_values = None
+
+     try:
+         logger.info("Running predict_proba()... This may take a while")
+         probability_values = model.predict_proba(X)
+         logger.info("Done running predict_proba()")
+     except MissingOrInvalidModelPredictFnError:
+         # if there is no predict_proba(), it's likely a regression model or a
+         # classification model that doesn't support predict_proba()
+         logger.info("Not running predict_proba() for unsupported models.")
+
+     try:
+         logger.info("Running predict()... This may take a while")
+         prediction_values = model.predict(X)
+         logger.info("Done running predict()")
+     except MissingOrInvalidModelPredictFnError:
+         raise MissingOrInvalidModelPredictFnError(
+             "Cannot compute predictions for models that don't support inference. "
+             "You can pass `prediction_values` or `prediction_column` to use precomputed predictions"
+         )
+
+     # TODO: this is really not ideal/robust and should not be handled by the dataset class
+     if probability_values is None and _is_probabilities(prediction_values):
+         logger.info(
+             "Predict method returned probabilities instead of direct labels or regression values. "
+             "This implies the model is likely configured for a classification task with probability output."
+         )
+         threshold = 0.5
+         logger.info(
+             f"Converting probabilities to binary classes using thresholding with `{threshold=}`."
+         )
+
+         return prediction_values, (prediction_values > threshold).astype(int)
+
+     return probability_values, prediction_values
+
+
+ def convert_index_to_datetime(df):
+     """
+     Attempts to convert the index of the dataset to a datetime index
+     and leaves the index unchanged if it fails.
+     """
+     converted_index = pd.to_datetime(df.index, errors="coerce")
+
+     # The conversion was successful if there are no NaT values
+     if not converted_index.isnull().any():
+         df.index = converted_index
+
+     return df
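To make the thresholding fallback in compute_predictions concrete: the 0.5 threshold is applied only when predict() returns a 1-D array whose values all lie in [0, 1] with at least one value strictly between them. A standalone check of the (private) heuristic, imported here purely for illustration:

import numpy as np

from validmind.vm_models.dataset.utils import _is_probabilities

print(_is_probabilities(np.array([0.1, 0.8, 0.5])))  # True  -> binarized at threshold 0.5
print(_is_probabilities(np.array([0, 1, 1])))        # False -> already hard labels
print(_is_probabilities(np.array([2.5, 0.3])))       # False -> regression-style output

As the TODO in compute_predictions concedes, the heuristic cannot distinguish a regression model whose outputs happen to fall in [0, 1], so passing precomputed prediction_values to assign_predictions remains the safe path for such models.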