validmind 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. validmind/__init__.py +6 -3
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +193 -0
  4. validmind/api_client.py +45 -31
  5. validmind/client.py +33 -6
  6. validmind/datasets/classification/customer_churn.py +2 -2
  7. validmind/datasets/credit_risk/__init__.py +11 -0
  8. validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
  9. validmind/datasets/credit_risk/lending_club.py +394 -0
  10. validmind/datasets/nlp/__init__.py +5 -0
  11. validmind/datasets/nlp/cnn_dailymail.py +98 -0
  12. validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
  13. validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
  14. validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
  15. validmind/errors.py +11 -1
  16. validmind/logging.py +9 -2
  17. validmind/models/huggingface.py +2 -2
  18. validmind/models/pytorch.py +3 -3
  19. validmind/models/sklearn.py +4 -4
  20. validmind/template.py +2 -2
  21. validmind/test_suites/__init__.py +4 -2
  22. validmind/tests/__init__.py +130 -45
  23. validmind/tests/data_validation/DatasetDescription.py +0 -1
  24. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
  25. validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
  26. validmind/tests/data_validation/ScatterPlot.py +8 -2
  27. validmind/tests/data_validation/nlp/StopWords.py +1 -6
  28. validmind/tests/data_validation/nlp/TextDescription.py +20 -9
  29. validmind/tests/decorator.py +313 -0
  30. validmind/tests/model_validation/BertScore.py +1 -1
  31. validmind/tests/model_validation/BertScoreAggregate.py +1 -1
  32. validmind/tests/model_validation/BleuScore.py +1 -1
  33. validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
  34. validmind/tests/model_validation/ContextualRecall.py +1 -1
  35. validmind/tests/model_validation/FeaturesAUC.py +110 -0
  36. validmind/tests/model_validation/MeteorScore.py +92 -0
  37. validmind/tests/model_validation/RegardHistogram.py +6 -7
  38. validmind/tests/model_validation/RegardScore.py +4 -6
  39. validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
  40. validmind/tests/model_validation/RougeMetrics.py +7 -5
  41. validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
  42. validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
  43. validmind/tests/model_validation/TokenDisparity.py +1 -1
  44. validmind/tests/model_validation/ToxicityHistogram.py +1 -1
  45. validmind/tests/model_validation/ToxicityScore.py +1 -1
  46. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  47. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
  48. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
  49. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
  50. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +16 -17
  51. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
  52. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  53. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
  54. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
  55. validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
  56. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
  57. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
  58. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
  59. validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
  60. validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
  61. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
  62. validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
  63. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +55 -5
  64. validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
  65. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +11 -5
  66. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
  67. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
  68. validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
  69. validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
  70. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
  71. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
  72. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  73. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  74. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  75. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +2 -2
  76. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
  77. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
  78. validmind/tests/prompt_validation/ai_powered_test.py +2 -0
  79. validmind/tests/test_providers.py +14 -124
  80. validmind/unit_metrics/__init__.py +75 -70
  81. validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
  82. validmind/unit_metrics/classification/sklearn/F1.py +13 -0
  83. validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
  84. validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
  85. validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
  86. validmind/unit_metrics/composite.py +228 -0
  87. validmind/unit_metrics/regression/GiniCoefficient.py +33 -0
  88. validmind/unit_metrics/regression/HuberLoss.py +23 -0
  89. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +30 -0
  90. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +16 -0
  91. validmind/unit_metrics/regression/MeanBiasDeviation.py +13 -0
  92. validmind/unit_metrics/regression/QuantileLoss.py +15 -0
  93. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +21 -0
  94. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +13 -0
  95. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +13 -0
  96. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +13 -0
  97. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +20 -0
  98. validmind/utils.py +20 -31
  99. validmind/vm_models/__init__.py +0 -2
  100. validmind/vm_models/dataset.py +623 -29
  101. validmind/vm_models/figure.py +52 -17
  102. validmind/vm_models/test/metric.py +33 -31
  103. validmind/vm_models/test/output_template.py +0 -27
  104. validmind/vm_models/test/result_wrapper.py +68 -36
  105. validmind/vm_models/test/test.py +4 -2
  106. validmind/vm_models/test/threshold_test.py +24 -14
  107. validmind/vm_models/test_context.py +7 -0
  108. validmind/vm_models/test_suite/runner.py +1 -1
  109. validmind/vm_models/test_suite/summary.py +3 -3
  110. validmind/vm_models/test_suite/test.py +1 -1
  111. validmind/vm_models/test_suite/test_suite.py +2 -1
  112. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/METADATA +18 -18
  113. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/RECORD +116 -94
  114. validmind-2.1.0.dist-info/entry_points.txt +3 -0
  115. validmind/tests/__types__.py +0 -62
  116. validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
  117. validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
  118. validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
  119. validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
  120. validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -20
  121. validmind/unit_metrics/sklearn/classification/F1.py +0 -22
  122. validmind/unit_metrics/sklearn/classification/Precision.py +0 -22
  123. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -20
  124. validmind/unit_metrics/sklearn/classification/Recall.py +0 -20
  125. validmind/vm_models/test/unit_metric.py +0 -88
  126. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/LICENSE +0 -0
  127. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/WHEEL +0 -0
validmind/vm_models/dataset.py
@@ -6,12 +6,15 @@
 Dataset class wrapper
 """
 
+import warnings
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import VMModel
 
@@ -35,9 +38,246 @@ class VMDataset(ABC):
         pass
 
     @abstractmethod
-    def serialize(self):
+    def assign_predictions(
+        self,
+        model,
+        prediction_values: list = None,
+        prediction_probabilities: list = None,
+        prediction_column=None,
+        probability_column=None,
+    ):
         """
-        Serializes the dataset to a dictionary.
+        Assigns predictions to the dataset for a given model or prediction values.
+        The dataset is updated with a new column containing the predictions.
+        """
+        pass
+
+    @abstractmethod
+    def get_extra_column(self, column_name):
+        """
+        Returns the values of the specified extra column.
+
+        Args:
+            column_name (str): The name of the extra column.
+
+        Returns:
+            np.ndarray: The values of the extra column.
+        """
+        pass
+
+    @abstractmethod
+    def add_extra_column(self, column_name, column_values=None):
+        """
+        Adds an extra column to the dataset without modifying the dataset `features` and `target` columns.
+
+        Args:
+            column_name (str): The name of the extra column.
+            column_values (np.ndarray, optional): The values of the extra column.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def input_id(self) -> str:
+        """
+        Returns input id of dataset.
+
+        Returns:
+            str: input_id.
+        """
+        return self.input_id
+
+    @property
+    @abstractmethod
+    def columns(self) -> list:
+        """
+        Returns the the list of columns in the dataset.
+
+        Returns:
+            List[str]: The columns list.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def target_column(self) -> str:
+        """
+        Returns the target column name of the dataset.
+
+        Returns:
+            str: The target column name.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def feature_columns(self) -> list:
+        """
+        Returns the feature columns of the dataset. If _feature_columns is None,
+        it returns all columns except the target column.
+
+        Returns:
+            list: The list of feature column names.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def text_column(self) -> str:
+        """
+        Returns the text column of the dataset.
+
+        Returns:
+            str: The text column name.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def x(self) -> np.ndarray:
+        """
+        Returns the input features (X) of the dataset.
+
+        Returns:
+            np.ndarray: The input features.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def y(self) -> np.ndarray:
+        """
+        Returns the target variables (y) of the dataset.
+
+        Returns:
+            np.ndarray: The target variables.
+        """
+        pass
+
+    @abstractmethod
+    def y_pred(self, model) -> np.ndarray:
+        """
+        Returns the prediction values (y_pred) of the dataset for a given model.
+
+        Returns:
+            np.ndarray: The prediction values.
+        """
+        pass
+
+    def y_prob(self, model) -> np.ndarray:
+        """
+        Returns the prediction probabilities (y_prob) of the dataset for a given model.
+
+        Returns:
+            np.ndarray: The prediction probabilities.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def df(self):
+        """
+        Returns the dataset as a pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: The dataset as a DataFrame.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def copy(self):
+        """
+        Returns a copy of the raw_dataset dataframe.
+        """
+        pass
+
+    @abstractmethod
+    def x_df(self):
+        """
+        Returns the non target and prediction columns.
+
+        Returns:
+            pd.DataFrame: The non target and prediction columns.
+        """
+        pass
+
+    @abstractmethod
+    def y_df(self):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        pass
+
+    @abstractmethod
+    def y_pred_df(self, model):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        pass
+
+    @abstractmethod
+    def y_prob_df(self, model):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        pass
+
+    @abstractmethod
+    def prediction_column(self, model) -> str:
+        """
+        Returns the prediction column name of the dataset.
+
+        Returns:
+            str: The prediction column name.
+        """
+        pass
+
+    def probability_column(self, model) -> str:
+        """
+        Returns the probability column name of the dataset.
+
+        Returns:
+            str: The probability column name.
+        """
+        pass
+
+    @abstractmethod
+    def get_features_columns(self):
+        """
+        Returns the column names of the feature variables.
+
+        Returns:
+            List[str]: The column names of the feature variables.
+        """
+        pass
+
+    @abstractmethod
+    def get_numeric_features_columns(self):
+        """
+        Returns the column names of the numeric feature variables.
+
+        Returns:
+            List[str]: The column names of the numeric feature variables.
+        """
+        pass
+
+    @abstractmethod
+    def get_categorical_features_columns(self):
+        """
+        Returns the column names of the categorical feature variables.
+
+        Returns:
+            List[str]: The column names of the categorical feature variables.
         """
         pass
 
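Note: taken together, the expanded interface gives callers several ways to attach model outputs to a dataset. A minimal usage sketch; `df` is an assumed pandas DataFrame with a "target" column and `clf` an assumed fitted sklearn-style estimator, and while `init_dataset`/`init_model` are the library's entry points, the exact call shapes below are illustrative alternatives (use one), not prescribed:

    import validmind as vm

    # Hypothetical inputs: a pandas DataFrame `df` and a fitted estimator `clf`
    vm_model = vm.init_model(clf, input_id="my_model")
    vm_ds = vm.init_dataset(dataset=df, input_id="train_ds", target_column="target")

    # 1. Let the dataset compute predictions (and probabilities) via the model
    vm_ds.assign_predictions(model=vm_model)

    # 2. Supply precomputed values and probabilities
    X = df.drop(columns=["target"])
    vm_ds.assign_predictions(
        model=vm_model,
        prediction_values=clf.predict(X),
        prediction_probabilities=clf.predict_proba(X)[:, 1],
    )

    # 3. Point at columns that already exist in the dataset
    vm_ds.assign_predictions(
        model=vm_model, prediction_column="y_hat", probability_column="y_hat_proba"
    )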
@@ -62,6 +302,7 @@ class NumpyDataset(VMDataset):
     _extra_columns: dict = field(
         default_factory=lambda: {
             "prediction_columns": {},
+            "probability_columns": {},
             "group_by_column": None,
         }
     )
@@ -134,7 +375,7 @@
         # initialize target column
         self._target_column = target_column
         # initialize extra columns
-        self.__set_extra_columns(extra_columns, model)
+        self.__set_extra_columns(extra_columns)
         # initialize feature columns
         self.__set_feature_columns(feature_columns)
         # initialize text column, target class labels and options
@@ -144,10 +385,11 @@
         if model:
             self.assign_predictions(model)
 
-    def __set_extra_columns(self, extra_columns, model):
+    def __set_extra_columns(self, extra_columns):
         if extra_columns is None:
             extra_columns = {
                 "prediction_columns": {},
+                "probability_columns": {},
                 "group_by_column": None,
             }
         self._extra_columns = extra_columns
@@ -187,30 +429,91 @@
 
         return df
 
+    def __model_id_in_probability_columns(self, model, probability_column):
+        return model.input_id in self._extra_columns.get("probability_columns", {})
+
     def __model_id_in_prediction_columns(self, model, prediction_column):
         return model.input_id in self._extra_columns.get("prediction_columns", {})
 
     def __assign_prediction_values(self, model, pred_column, prediction_values):
+        # Link the prediction column with the model
         self._extra_columns.setdefault("prediction_columns", {})[
             model.input_id
         ] = pred_column
-        self._raw_dataset = np.hstack(
-            (self._raw_dataset, np.array(prediction_values).reshape(-1, 1))
+
+        # Check if the predictions are multi-dimensional (e.g., embeddings)
+        is_multi_dimensional = (
+            isinstance(prediction_values, np.ndarray) and prediction_values.ndim > 1
         )
-        self._columns.append(pred_column)
-        self._df[pred_column] = prediction_values
+
+        if is_multi_dimensional:
+            # For multi-dimensional outputs, convert to a list of lists to store in DataFrame
+            self._df[pred_column] = list(map(list, prediction_values))
+        else:
+            # If not multi-dimensional or a standard numpy array, reshape for compatibility
+            self._raw_dataset = np.hstack(
+                (self._raw_dataset, np.array(prediction_values).reshape(-1, 1))
+            )
+            self._df[pred_column] = prediction_values
+
+        # Update the dataset columns list
+        if pred_column not in self._columns:
+            self._columns.append(pred_column)
+
+    def __assign_prediction_probabilities(
+        self, model, prob_column, prediction_probabilities
+    ):
+        # Link the prediction column with the model
+        self._extra_columns.setdefault("probability_columns", {})[
+            model.input_id
+        ] = prob_column
+
+        # Check if the predictions are multi-dimensional (e.g., embeddings)
+        is_multi_dimensional = (
+            isinstance(prediction_probabilities, np.ndarray)
+            and prediction_probabilities.ndim > 1
+        )
+
+        if is_multi_dimensional:
+            # For multi-dimensional outputs, convert to a list of lists to store in DataFrame
+            self._df[prob_column] = list(map(list, prediction_probabilities))
+        else:
+            # If not multi-dimensional or a standard numpy array, reshape for compatibility
+            self._raw_dataset = np.hstack(
+                (self._raw_dataset, np.array(prediction_probabilities).reshape(-1, 1))
+            )
+            self._df[prob_column] = prediction_probabilities
+
+        # Update the dataset columns list
+        if prob_column not in self._columns:
+            self._columns.append(prob_column)
 
     def assign_predictions(  # noqa: C901 - we need to simplify this method
         self,
         model,
         prediction_values: list = None,
+        prediction_probabilities: list = None,
         prediction_column=None,
+        probability_column=None,
     ):
+        def _is_probability(output):
+            """Check if the output from the predict method is probabilities."""
+            # This is a simple check that assumes output is probabilities if they lie between 0 and 1
+            if np.all((output >= 0) & (output <= 1)):
+                # Check if there is at least one element that is neither 0 nor 1
+                if np.any((output > 0) & (output < 1)):
+                    return True
+            return np.all((output >= 0) & (output <= 1)) and np.any(
+                (output > 0) & (output < 1)
+            )
+
+        # Step 1: Check for Model Presence
        if not model:
             raise ValueError(
                 "Model must be provided to link prediction column with the dataset"
             )
 
+        # Step 2: Prediction Column Provided
         if prediction_column:
             if prediction_column not in self.columns:
                 raise ValueError(
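Note: the nested `_is_probability` helper gates the auto-detection branch further down in this method: output counts as probabilities only when every value lies in [0, 1] and at least one value is strictly between them, so hard 0/1 labels and regression outputs fall through to the plain-prediction path. A standalone sketch of the same heuristic:

    import numpy as np

    def is_probability(output):
        # True when all values lie in [0, 1] and at least one is strictly inside
        in_unit_interval = np.all((output >= 0) & (output <= 1))
        strictly_inside = np.any((output > 0) & (output < 1))
        return bool(in_unit_interval and strictly_inside)

    print(is_probability(np.array([0.1, 0.87, 0.5])))  # True:  probability output
    print(is_probability(np.array([0, 1, 1, 0])))      # False: hard class labels
    print(is_probability(np.array([3.2, -1.5])))       # False: regression output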
@@ -225,6 +528,8 @@
             self._extra_columns.setdefault("prediction_columns", {})[
                 model.input_id
             ] = prediction_column
+
+        # Step 4: Prediction Values Provided without Specific Column
         elif prediction_values is not None:
             if len(prediction_values) != self.df.shape[0]:
                 raise ValueError(
@@ -232,13 +537,58 @@
                 )
             pred_column = f"{model.input_id}_prediction"
             if pred_column in self.columns:
-                raise ValueError(
-                    f"Prediction column {pred_column} already exists in the dataset"
+                warnings.warn(
+                    f"Prediction column {pred_column} already exists in the dataset, overwriting the existing predictions",
+                    UserWarning,
                 )
+
+            logger.info(
+                f"Assigning prediction values to column '{pred_column}' and linked to model '{model.input_id}'"
+            )
             self.__assign_prediction_values(model, pred_column, prediction_values)
+
+        # Step 3: Probability Column Provided
+        if probability_column:
+            if probability_column not in self.columns:
+                raise ValueError(
+                    f"Probability column {probability_column} doesn't exist in the dataset"
+                )
+            if self.__model_id_in_probability_columns(
+                model=model, probability_column=probability_column
+            ):
+                raise ValueError(
+                    f"Probability column {probability_column} already linked to the VM model"
+                )
+            self._extra_columns.setdefault("probability_columns", {})[
+                model.input_id
+            ] = probability_column
+
+        # Step 5: Prediction Probabilities Provided without Specific Column
+        elif prediction_probabilities is not None:
+            if len(prediction_probabilities) != self.df.shape[0]:
+                raise ValueError(
+                    "Length of prediction probabilities doesn't match number of rows of the dataset"
+                )
+            prob_column = f"{model.input_id}_probabilities"
+            if prob_column in self.columns:
+                warnings.warn(
+                    f"Probability column {prob_column} already exists in the dataset, overwriting the existing probabilities",
+                    UserWarning,
+                )
+
+            logger.info(
+                f"Assigning prediction probabilities to column '{prob_column}' and linked to model '{model.input_id}'"
+            )
+            self.__assign_prediction_probabilities(
+                model, prob_column, prediction_probabilities
+            )
+
+        # Step 6: Neither Specific Column Nor Values Provided
         elif not self.__model_id_in_prediction_columns(
             model=model, prediction_column=prediction_column
         ):
+
+            # Compute prediction values directly from the VM model
             pred_column = f"{model.input_id}_prediction"
             if pred_column in self.columns:
                 logger.info(
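Note: where 2.0.1 raised a ValueError on re-assigning an existing prediction or probability column, 2.1.0 warns and overwrites. Callers that depended on the old hard failure can escalate the warning with the standard library (illustrative; `vm_ds` and `vm_model` as in the earlier sketch, `preds` an assumed array of precomputed predictions):

    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)  # turn overwrite warnings into errors
        vm_ds.assign_predictions(model=vm_model, prediction_values=preds)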
@@ -256,12 +606,107 @@
             )
 
             prediction_values = np.array(model.predict(x_only))
-            self.__assign_prediction_values(model, pred_column, prediction_values)
+
+            # Check if the prediction values are probabilities
+            if _is_probability(prediction_values):
+
+                threshold = 0.5
+
+                logger.info(
+                    "Predict method returned probabilities instead of direct labels or regression values. "
+                    + "This implies the model is likely configured for a classification task with probability output."
+                )
+                prob_column = f"{model.input_id}_probabilities"
+                logger.info(
+                    f"Assigning probabilities to column '{prob_column}' and computing class labels using a threshold of {threshold}."
+                )
+                self.__assign_prediction_probabilities(
+                    model, prob_column, prediction_values
+                )
+
+                # Convert probabilities to class labels based on the threshold
+                prediction_classes = (prediction_values > threshold).astype(int)
+                self.__assign_prediction_values(model, pred_column, prediction_classes)
+
+            else:
+
+                # If not assign the prediction values directly
+                pred_column = f"{model.input_id}_prediction"
+                self.__assign_prediction_values(model, pred_column, prediction_values)
+
+                try:
+                    logger.info("Running predict_proba()... This may take a while")
+                    prediction_probabilities = np.array(model.predict_proba(x_only))
+                    prob_column = f"{model.input_id}_probabilities"
+                    self.__assign_prediction_probabilities(
+                        model, prob_column, prediction_probabilities
+                    )
+                except MissingOrInvalidModelPredictFnError:
+                    # Log that predict_proba is not available or failed
+                    logger.warn(
+                        f"Model class '{model.__class__}' does not have a compatible predict_proba implementation."
+                        + " Please assign predictions directly with vm_dataset.assign_predictions(model, prediction_values)"
+                    )
+
+        # Step 7: Prediction Column Already Linked
         else:
             logger.info(
                 f"Prediction column {self._extra_columns['prediction_columns'][model.input_id]} already linked to the {model.input_id}"
             )
 
+    def get_extra_column(self, column_name):
+        """
+        Returns the values of the specified extra column.
+
+        Args:
+            column_name (str): The name of the extra column.
+
+        Returns:
+            np.ndarray: The values of the extra column.
+        """
+        if column_name not in self.extra_columns:
+            raise ValueError(f"Column {column_name} is not an extra column")
+
+        return self._df[column_name]
+
+    def add_extra_column(self, column_name, column_values=None):
+        """
+        Adds an extra column to the dataset without modifying the dataset `features` and `target` columns.
+
+        Args:
+            column_name (str): The name of the extra column.
+            column_values (np.ndarray, optional): The values of the extra column.
+        """
+        if column_name in self.extra_columns:
+            logger.info(f"Column {column_name} already registered as an extra column")
+            return
+
+        # The column name already exists in the dataset so we just assign the extra column
+        if column_name in self.columns:
+            self._extra_columns[column_name] = column_name
+            logger.info(
+                f"Column {column_name} exists in the dataset, registering as an extra column"
+            )
+            return
+
+        if column_values is None:
+            raise ValueError(
+                "Column values must be provided when the column doesn't exist in the dataset"
+            )
+
+        if len(column_values) != self.df.shape[0]:
+            raise ValueError(
+                "Length of column values doesn't match number of rows of the dataset"
+            )
+
+        self._raw_dataset = np.hstack(
+            (self._raw_dataset, np.array(column_values).reshape(-1, 1))
+        )
+        self._columns.append(column_name)
+        self._df[column_name] = column_values
+        self._extra_columns[column_name] = column_name
+        logger.info(f"Column {column_name} added as an extra column")
+
     @property
     def raw_dataset(self) -> np.ndarray:
         """
@@ -397,21 +842,83 @@
             ],
         ]
 
-    def y_pred(self, model_id) -> np.ndarray:
+    def y_pred(self, model) -> np.ndarray:
         """
-        Returns the prediction variable (y_pred) of the dataset.
+        Returns the prediction variables for a given model, accommodating
+        both scalar predictions and multi-dimensional outputs such as embeddings.
+
+        Args:
+            model (VMModel): The model whose predictions are sought.
 
         Returns:
-            np.ndarray: The prediction variables.
+            np.ndarray: The prediction variables, either as a flattened array for
+            scalar predictions or as an array of arrays for multi-dimensional outputs.
+        """
+        pred_column = self.prediction_column(model)
+
+        # First, attempt to retrieve the prediction data from the DataFrame
+        if hasattr(self, "_df") and pred_column in self._df.columns:
+            predictions = self._df[pred_column].to_numpy()
+
+            # Check if the predictions are stored as objects (e.g., lists for embeddings)
+            if self._df[pred_column].dtype == object:
+                # Attempt to convert lists to a numpy array
+                try:
+                    predictions = np.stack(predictions)
+                except ValueError as e:
+                    # Handling cases where predictions cannot be directly stacked
+                    raise ValueError(f"Error stacking prediction arrays: {e}")
+        else:
+            # Fallback to using the raw numpy dataset if DataFrame is not available or suitable
+            try:
+                predictions = self.raw_dataset[
+                    :, self.columns.index(pred_column)
+                ].flatten()
+            except IndexError as e:
+                raise ValueError(
+                    f"Prediction column '{pred_column}' not found in raw dataset: {e}"
+                )
+
+        return predictions
+
+    def y_prob(self, model) -> np.ndarray:
         """
-        return self.raw_dataset[
-            :,
-            [
-                self.columns.index(name)
-                for name in self.columns
-                if name == self.prediction_column(model_id=model_id)
-            ],
-        ].flatten()
+        Returns the prediction variables for a given model, accommodating
+        both scalar predictions and multi-dimensional outputs such as embeddings.
+
+        Args:
+            model (str): The ID of the model whose predictions are sought.
+
+        Returns:
+            np.ndarray: The prediction variables, either as a flattened array for
+            scalar predictions or as an array of arrays for multi-dimensional outputs.
+        """
+        prob_column = self.probability_column(model)
+
+        # First, attempt to retrieve the prediction data from the DataFrame
+        if hasattr(self, "_df") and prob_column in self._df.columns:
+            probabilities = self._df[prob_column].to_numpy()
+
+            # Check if the predictions are stored as objects (e.g., lists for embeddings)
+            if self._df[prob_column].dtype == object:
+                # Attempt to convert lists to a numpy array
+                try:
+                    probabilities = np.stack(probabilities)
+                except ValueError as e:
+                    # Handling cases where predictions cannot be directly stacked
+                    raise ValueError(f"Error stacking prediction arrays: {e}")
+        else:
+            # Fallback to using the raw numpy dataset if DataFrame is not available or suitable
+            try:
+                probabilities = self.raw_dataset[
+                    :, self.columns.index(prob_column)
+                ].flatten()
+            except IndexError as e:
+                raise ValueError(
+                    f"Prediction column '{prob_column}' not found in raw dataset: {e}"
+                )
+
+        return probabilities
 
     @property
     def type(self) -> str:
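Note: multi-dimensional predictions are stored as per-row Python lists, so the DataFrame column has dtype `object` and `y_pred`/`y_prob` rebuild a 2-D array with `np.stack`. The round trip in isolation (self-contained sketch, not validmind API):

    import numpy as np
    import pandas as pd

    embeddings = np.random.rand(3, 4)  # e.g., a 4-dim embedding per row
    df = pd.DataFrame({"model_prediction": list(map(list, embeddings))})

    col = df["model_prediction"].to_numpy()  # object array of lists
    assert col.dtype == object
    restored = np.stack(col)                 # back to shape (3, 4)
    assert restored.shape == (3, 4)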
@@ -458,22 +965,32 @@
         """
         return self._df[self.target_column]
 
-    def y_pred_df(self, model_id):
+    def y_pred_df(self, model):
         """
         Returns the target columns (y) of the dataset.
 
         Returns:
             pd.DataFrame: The target columns.
         """
-        return self._df[self.prediction_column(model_id=model_id)]
+        return self._df[self.prediction_column(model)]
 
-    def prediction_column(self, model_id) -> str:
+    def y_prob_df(self, model):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        return self._df[self.probability_column(model)]
+
+    def prediction_column(self, model) -> str:
         """
         Returns the prediction column name of the dataset.
 
         Returns:
             str: The prediction column name.
         """
+        model_id = model.input_id
         pred_column = self._extra_columns.get("prediction_columns", {}).get(model_id)
         if pred_column is None:
             raise ValueError(
@@ -481,6 +998,21 @@
             )
         return pred_column
 
+    def probability_column(self, model) -> str:
+        """
+        Returns the prediction column name of the dataset.
+
+        Returns:
+            str: The prediction column name.
+        """
+        model_id = model.input_id
+        prob_column = self._extra_columns.get("probability_columns", {}).get(model_id)
+        if prob_column is None:
+            raise ValueError(
+                f"Probability column is not linked with the given {model_id}"
+            )
+        return prob_column
+
     def serialize(self):
         """
         Serializes the dataset to a dictionary.
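Note: `probability_column` mirrors `prediction_column` and raises ValueError when no probabilities are linked for the model, so callers can probe availability (illustrative, continuing the earlier sketch):

    try:
        prob_col = vm_ds.probability_column(vm_model)
    except ValueError:
        prob_col = None  # nothing linked; fall back to hard labels via prediction_column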
@@ -608,10 +1140,15 @@ class DataFrameDataset(NumpyDataset):
 
         Args:
             raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
+            input_id (str, optional): Identifier for the dataset. Defaults to None.
+            model (VMModel, optional): Model associated with the dataset. Defaults to None.
             target_column (str, optional): The target column of the dataset. Defaults to None.
+            extra_columns (dict, optional): Extra columns to include in the dataset. Defaults to None.
             feature_columns (list, optional): The feature columns of the dataset. Defaults to None.
-            text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
-            target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
+            text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
+            target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
+            options (dict, optional): Additional options for the dataset. Defaults to None.
+            date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
         """
         index = None
         if isinstance(raw_dataset.index, pd.Index):
@@ -634,6 +1171,57 @@
         )
 
 
+@dataclass
+class PolarsDataset(NumpyDataset):
+    """
+    VM dataset implementation for Polars DataFrame.
+    """
+
+    def __init__(
+        self,
+        raw_dataset: pl.DataFrame,
+        input_id: str = None,
+        model: VMModel = None,
+        target_column: str = None,
+        extra_columns: dict = None,
+        feature_columns: list = None,
+        text_column: str = None,
+        target_class_labels: dict = None,
+        options: dict = None,
+        date_time_index: bool = False,
+    ):
+        """
+        Initializes a PolarsDataset instance.
+
+        Args:
+            raw_dataset (pl.DataFrame): The raw dataset as a Polars DataFrame.
+            input_id (str, optional): Identifier for the dataset. Defaults to None.
+            model (VMModel, optional): Model associated with the dataset. Defaults to None.
+            target_column (str, optional): The target column of the dataset. Defaults to None.
+            extra_columns (dict, optional): Extra columns to include in the dataset. Defaults to None.
+            feature_columns (list, optional): The feature columns of the dataset. Defaults to None.
+            text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
+            target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
+            options (dict, optional): Additional options for the dataset. Defaults to None.
+            date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
+        """
+        super().__init__(
+            raw_dataset=raw_dataset.to_numpy(),
+            input_id=input_id,
+            model=model,
+            index_name=None,
+            index=None,
+            columns=raw_dataset.columns,
+            target_column=target_column,
+            extra_columns=extra_columns,
+            feature_columns=feature_columns,
+            text_column=text_column,
+            target_class_labels=target_class_labels,
+            options=options,
+            date_time_index=date_time_index,
+        )
+
+
 @dataclass
 class TorchDataset(NumpyDataset):
     """
@@ -668,12 +1256,16 @@ class TorchDataset(NumpyDataset):
            text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
             target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
         """
-        # if we can't import torch, then it's not a PyTorch model
+
         try:
             import torch
         except ImportError:
-            return False
+            raise ImportError(
+                "PyTorch is not installed, please run `pip install validmind[pytorch]`"
+            )
+
         columns = []
+
         for id, tens in zip(range(0, len(raw_dataset.tensors)), raw_dataset.tensors):
             if id == 0 and feature_columns is None:
                 n_cols = tens.shape[1]
@@ -684,9 +1276,11 @@
                 ).astype(str)
             ]
             columns.append(feature_columns)
+
         elif id == 1 and target_column is None:
             target_column = "y"
             columns.append(target_column)
+
         elif id == 2 and extra_columns is None:
             extra_columns.prediction_column = "y_pred"
             columns.append(extra_columns.prediction_column)
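Note: the TorchDataset hunks above replace a silently swallowed import failure (`return False` from `__init__`) with an explicit ImportError, and map positional tensors to features, target, and predictions in order. A hypothetical construction, assuming the pytorch extra is installed and a keyword signature analogous to DataFrameDataset (only `raw_dataset` is confirmed by this diff):

    import torch
    from torch.utils.data import TensorDataset
    from validmind.vm_models.dataset import TorchDataset

    x = torch.rand(8, 3)             # tensor 0 -> auto-named feature columns
    y = torch.randint(0, 2, (8, 1))  # tensor 1 -> target column "y"
    vm_ds = TorchDataset(raw_dataset=TensorDataset(x, y))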