validmind 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +6 -3
- validmind/__version__.py +1 -1
- validmind/ai.py +193 -0
- validmind/api_client.py +45 -31
- validmind/client.py +33 -6
- validmind/datasets/classification/customer_churn.py +2 -2
- validmind/datasets/credit_risk/__init__.py +11 -0
- validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club.py +394 -0
- validmind/datasets/nlp/__init__.py +5 -0
- validmind/datasets/nlp/cnn_dailymail.py +98 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
- validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
- validmind/errors.py +11 -1
- validmind/logging.py +9 -2
- validmind/models/huggingface.py +2 -2
- validmind/models/pytorch.py +3 -3
- validmind/models/sklearn.py +4 -4
- validmind/template.py +2 -2
- validmind/test_suites/__init__.py +4 -2
- validmind/tests/__init__.py +130 -45
- validmind/tests/data_validation/DatasetDescription.py +0 -1
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
- validmind/tests/data_validation/ScatterPlot.py +8 -2
- validmind/tests/data_validation/nlp/StopWords.py +1 -6
- validmind/tests/data_validation/nlp/TextDescription.py +20 -9
- validmind/tests/decorator.py +313 -0
- validmind/tests/model_validation/BertScore.py +1 -1
- validmind/tests/model_validation/BertScoreAggregate.py +1 -1
- validmind/tests/model_validation/BleuScore.py +1 -1
- validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
- validmind/tests/model_validation/ContextualRecall.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +110 -0
- validmind/tests/model_validation/MeteorScore.py +92 -0
- validmind/tests/model_validation/RegardHistogram.py +6 -7
- validmind/tests/model_validation/RegardScore.py +4 -6
- validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
- validmind/tests/model_validation/RougeMetrics.py +7 -5
- validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
- validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
- validmind/tests/model_validation/TokenDisparity.py +1 -1
- validmind/tests/model_validation/ToxicityHistogram.py +1 -1
- validmind/tests/model_validation/ToxicityScore.py +1 -1
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +16 -17
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
- validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +55 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +11 -5
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
- validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
- validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
- validmind/tests/prompt_validation/ai_powered_test.py +2 -0
- validmind/tests/test_providers.py +14 -124
- validmind/unit_metrics/__init__.py +75 -70
- validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
- validmind/unit_metrics/classification/sklearn/F1.py +13 -0
- validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
- validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
- validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
- validmind/unit_metrics/composite.py +228 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +33 -0
- validmind/unit_metrics/regression/HuberLoss.py +23 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +30 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +16 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +13 -0
- validmind/unit_metrics/regression/QuantileLoss.py +15 -0
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +21 -0
- validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +13 -0
- validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +13 -0
- validmind/unit_metrics/regression/sklearn/RSquaredScore.py +13 -0
- validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +20 -0
- validmind/utils.py +20 -31
- validmind/vm_models/__init__.py +0 -2
- validmind/vm_models/dataset.py +623 -29
- validmind/vm_models/figure.py +52 -17
- validmind/vm_models/test/metric.py +33 -31
- validmind/vm_models/test/output_template.py +0 -27
- validmind/vm_models/test/result_wrapper.py +68 -36
- validmind/vm_models/test/test.py +4 -2
- validmind/vm_models/test/threshold_test.py +24 -14
- validmind/vm_models/test_context.py +7 -0
- validmind/vm_models/test_suite/runner.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +1 -1
- validmind/vm_models/test_suite/test_suite.py +2 -1
- {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/METADATA +18 -18
- {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/RECORD +116 -94
- validmind-2.1.0.dist-info/entry_points.txt +3 -0
- validmind/tests/__types__.py +0 -62
- validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
- validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
- validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
- validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
- validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -20
- validmind/unit_metrics/sklearn/classification/F1.py +0 -22
- validmind/unit_metrics/sklearn/classification/Precision.py +0 -22
- validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -20
- validmind/unit_metrics/sklearn/classification/Recall.py +0 -20
- validmind/vm_models/test/unit_metric.py +0 -88
- {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/LICENSE +0 -0
- {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/WHEEL +0 -0
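One structural change in the list above is worth calling out: the unit-metrics tree is inverted, so modules that lived under validmind/unit_metrics/sklearn/classification/ now live under validmind/unit_metrics/classification/sklearn/ (task type first, framework second). A minimal sketch of what the rename means for dotted metric IDs; the migrate_metric_id helper is hypothetical and written here only to illustrate the path inversion, it is not part of the package:

    # Hypothetical helper illustrating the unit-metric path inversion in 2.1.0.
    # Grounded only in the file renames listed above.

    OLD_ID = "validmind.unit_metrics.sklearn.classification.Accuracy"  # 2.0.1 layout
    NEW_ID = "validmind.unit_metrics.classification.sklearn.Accuracy"  # 2.1.0 layout

    def migrate_metric_id(metric_id: str) -> str:
        """Rewrite a 2.0.1-style sklearn classification metric ID to the 2.1.0 layout."""
        prefix = "validmind.unit_metrics."
        old_sub, new_sub = "sklearn.classification.", "classification.sklearn."
        if metric_id.startswith(prefix + old_sub):
            return prefix + new_sub + metric_id[len(prefix + old_sub):]
        return metric_id

    assert migrate_metric_id(OLD_ID) == NEW_ID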
validmind/vm_models/dataset.py
CHANGED
@@ -6,12 +6,15 @@
 Dataset class wrapper
 """
 
+import warnings
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
+from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
 from validmind.vm_models.model import VMModel
 
@@ -35,9 +38,246 @@ class VMDataset(ABC):
         pass
 
     @abstractmethod
-    def
+    def assign_predictions(
+        self,
+        model,
+        prediction_values: list = None,
+        prediction_probabilities: list = None,
+        prediction_column=None,
+        probability_column=None,
+    ):
         """
-
+        Assigns predictions to the dataset for a given model or prediction values.
+        The dataset is updated with a new column containing the predictions.
+        """
+        pass
+
+    @abstractmethod
+    def get_extra_column(self, column_name):
+        """
+        Returns the values of the specified extra column.
+
+        Args:
+            column_name (str): The name of the extra column.
+
+        Returns:
+            np.ndarray: The values of the extra column.
+        """
+        pass
+
+    @abstractmethod
+    def add_extra_column(self, column_name, column_values=None):
+        """
+        Adds an extra column to the dataset without modifying the dataset `features` and `target` columns.
+
+        Args:
+            column_name (str): The name of the extra column.
+            column_values (np.ndarray, optional): The values of the extra column.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def input_id(self) -> str:
+        """
+        Returns input id of dataset.
+
+        Returns:
+            str: input_id.
+        """
+        return self.input_id
+
+    @property
+    @abstractmethod
+    def columns(self) -> list:
+        """
+        Returns the list of columns in the dataset.
+
+        Returns:
+            List[str]: The columns list.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def target_column(self) -> str:
+        """
+        Returns the target column name of the dataset.
+
+        Returns:
+            str: The target column name.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def feature_columns(self) -> list:
+        """
+        Returns the feature columns of the dataset. If _feature_columns is None,
+        it returns all columns except the target column.
+
+        Returns:
+            list: The list of feature column names.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def text_column(self) -> str:
+        """
+        Returns the text column of the dataset.
+
+        Returns:
+            str: The text column name.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def x(self) -> np.ndarray:
+        """
+        Returns the input features (X) of the dataset.
+
+        Returns:
+            np.ndarray: The input features.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def y(self) -> np.ndarray:
+        """
+        Returns the target variables (y) of the dataset.
+
+        Returns:
+            np.ndarray: The target variables.
+        """
+        pass
+
+    @abstractmethod
+    def y_pred(self, model) -> np.ndarray:
+        """
+        Returns the prediction values (y_pred) of the dataset for a given model.
+
+        Returns:
+            np.ndarray: The prediction values.
+        """
+        pass
+
+    def y_prob(self, model) -> np.ndarray:
+        """
+        Returns the prediction probabilities (y_prob) of the dataset for a given model.
+
+        Returns:
+            np.ndarray: The prediction probabilities.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def df(self):
+        """
+        Returns the dataset as a pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: The dataset as a DataFrame.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def copy(self):
+        """
+        Returns a copy of the raw_dataset dataframe.
+        """
+        pass
+
+    @abstractmethod
+    def x_df(self):
+        """
+        Returns the non target and prediction columns.
+
+        Returns:
+            pd.DataFrame: The non target and prediction columns.
+        """
+        pass
+
+    @abstractmethod
+    def y_df(self):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        pass
+
+    @abstractmethod
+    def y_pred_df(self, model):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        pass
+
+    @abstractmethod
+    def y_prob_df(self, model):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        pass
+
+    @abstractmethod
+    def prediction_column(self, model) -> str:
+        """
+        Returns the prediction column name of the dataset.
+
+        Returns:
+            str: The prediction column name.
+        """
+        pass
+
+    def probability_column(self, model) -> str:
+        """
+        Returns the probability column name of the dataset.
+
+        Returns:
+            str: The probability column name.
+        """
+        pass
+
+    @abstractmethod
+    def get_features_columns(self):
+        """
+        Returns the column names of the feature variables.
+
+        Returns:
+            List[str]: The column names of the feature variables.
+        """
+        pass
+
+    @abstractmethod
+    def get_numeric_features_columns(self):
+        """
+        Returns the column names of the numeric feature variables.
+
+        Returns:
+            List[str]: The column names of the numeric feature variables.
+        """
+        pass
+
+    @abstractmethod
+    def get_categorical_features_columns(self):
+        """
+        Returns the column names of the categorical feature variables.
+
+        Returns:
+            List[str]: The column names of the categorical feature variables.
         """
         pass
 
@@ -62,6 +302,7 @@ class NumpyDataset(VMDataset):
     _extra_columns: dict = field(
         default_factory=lambda: {
             "prediction_columns": {},
+            "probability_columns": {},
             "group_by_column": None,
         }
     )
@@ -134,7 +375,7 @@ class NumpyDataset(VMDataset):
         # initialize target column
         self._target_column = target_column
         # initialize extra columns
-        self.__set_extra_columns(extra_columns
+        self.__set_extra_columns(extra_columns)
         # initialize feature columns
         self.__set_feature_columns(feature_columns)
         # initialize text column, target class labels and options
@@ -144,10 +385,11 @@ class NumpyDataset(VMDataset):
         if model:
             self.assign_predictions(model)
 
-    def __set_extra_columns(self, extra_columns
+    def __set_extra_columns(self, extra_columns):
         if extra_columns is None:
             extra_columns = {
                 "prediction_columns": {},
+                "probability_columns": {},
                 "group_by_column": None,
             }
         self._extra_columns = extra_columns
@@ -187,30 +429,91 @@ class NumpyDataset(VMDataset):
 
         return df
 
+    def __model_id_in_probability_columns(self, model, probability_column):
+        return model.input_id in self._extra_columns.get("probability_columns", {})
+
     def __model_id_in_prediction_columns(self, model, prediction_column):
         return model.input_id in self._extra_columns.get("prediction_columns", {})
 
     def __assign_prediction_values(self, model, pred_column, prediction_values):
+        # Link the prediction column with the model
         self._extra_columns.setdefault("prediction_columns", {})[
             model.input_id
         ] = pred_column
-
-
+
+        # Check if the predictions are multi-dimensional (e.g., embeddings)
+        is_multi_dimensional = (
+            isinstance(prediction_values, np.ndarray) and prediction_values.ndim > 1
         )
-
-
+
+        if is_multi_dimensional:
+            # For multi-dimensional outputs, convert to a list of lists to store in DataFrame
+            self._df[pred_column] = list(map(list, prediction_values))
+        else:
+            # If not multi-dimensional or a standard numpy array, reshape for compatibility
+            self._raw_dataset = np.hstack(
+                (self._raw_dataset, np.array(prediction_values).reshape(-1, 1))
+            )
+            self._df[pred_column] = prediction_values
+
+        # Update the dataset columns list
+        if pred_column not in self._columns:
+            self._columns.append(pred_column)
+
+    def __assign_prediction_probabilities(
+        self, model, prob_column, prediction_probabilities
+    ):
+        # Link the prediction column with the model
+        self._extra_columns.setdefault("probability_columns", {})[
+            model.input_id
+        ] = prob_column
+
+        # Check if the predictions are multi-dimensional (e.g., embeddings)
+        is_multi_dimensional = (
+            isinstance(prediction_probabilities, np.ndarray)
+            and prediction_probabilities.ndim > 1
+        )
+
+        if is_multi_dimensional:
+            # For multi-dimensional outputs, convert to a list of lists to store in DataFrame
+            self._df[prob_column] = list(map(list, prediction_probabilities))
+        else:
+            # If not multi-dimensional or a standard numpy array, reshape for compatibility
+            self._raw_dataset = np.hstack(
+                (self._raw_dataset, np.array(prediction_probabilities).reshape(-1, 1))
+            )
+            self._df[prob_column] = prediction_probabilities
+
+        # Update the dataset columns list
+        if prob_column not in self._columns:
+            self._columns.append(prob_column)
 
     def assign_predictions(  # noqa: C901 - we need to simplify this method
         self,
         model,
         prediction_values: list = None,
+        prediction_probabilities: list = None,
         prediction_column=None,
+        probability_column=None,
     ):
+        def _is_probability(output):
+            """Check if the output from the predict method is probabilities."""
+            # This is a simple check that assumes output is probabilities if they lie between 0 and 1
+            if np.all((output >= 0) & (output <= 1)):
+                # Check if there is at least one element that is neither 0 nor 1
+                if np.any((output > 0) & (output < 1)):
+                    return True
+            return np.all((output >= 0) & (output <= 1)) and np.any(
+                (output > 0) & (output < 1)
+            )
+
+        # Step 1: Check for Model Presence
         if not model:
             raise ValueError(
                 "Model must be provided to link prediction column with the dataset"
             )
 
+        # Step 2: Prediction Column Provided
         if prediction_column:
             if prediction_column not in self.columns:
                 raise ValueError(
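The _is_probability helper above decides, heuristically, whether predict() returned probabilities: every value must lie in [0, 1] and at least one must be strictly between them, so arrays of hard 0/1 labels are excluded. (Note the nested if and the final return encode the same condition twice.) A standalone replica for experimenting with the heuristic, in pure numpy; only the behavior, not the function name, comes from the diff:

    import numpy as np

    def is_probability(output: np.ndarray) -> bool:
        # All values in [0, 1] AND at least one strictly between 0 and 1
        in_unit_interval = np.all((output >= 0) & (output <= 1))
        has_fractional = np.any((output > 0) & (output < 1))
        return bool(in_unit_interval and has_fractional)

    print(is_probability(np.array([0.1, 0.8, 0.5])))  # True  -> treated as probabilities
    print(is_probability(np.array([0, 1, 1, 0])))     # False -> hard class labels
    print(is_probability(np.array([2.3, -0.7])))      # False -> regression-style output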
@@ -225,6 +528,8 @@ class NumpyDataset(VMDataset):
             self._extra_columns.setdefault("prediction_columns", {})[
                 model.input_id
             ] = prediction_column
+
+        # Step 4: Prediction Values Provided without Specific Column
         elif prediction_values is not None:
             if len(prediction_values) != self.df.shape[0]:
                 raise ValueError(
@@ -232,13 +537,58 @@ class NumpyDataset(VMDataset):
                 )
             pred_column = f"{model.input_id}_prediction"
             if pred_column in self.columns:
-
-                    f"Prediction column {pred_column} already exists in the dataset"
+                warnings.warn(
+                    f"Prediction column {pred_column} already exists in the dataset, overwriting the existing predictions",
+                    UserWarning,
                 )
+
+            logger.info(
+                f"Assigning prediction values to column '{pred_column}' and linked to model '{model.input_id}'"
+            )
             self.__assign_prediction_values(model, pred_column, prediction_values)
+
+        # Step 3: Probability Column Provided
+        if probability_column:
+            if probability_column not in self.columns:
+                raise ValueError(
+                    f"Probability column {probability_column} doesn't exist in the dataset"
+                )
+            if self.__model_id_in_probability_columns(
+                model=model, probability_column=probability_column
+            ):
+                raise ValueError(
+                    f"Probability column {probability_column} already linked to the VM model"
+                )
+            self._extra_columns.setdefault("probability_columns", {})[
+                model.input_id
+            ] = probability_column
+
+        # Step 5: Prediction Probabilities Provided without Specific Column
+        elif prediction_probabilities is not None:
+            if len(prediction_probabilities) != self.df.shape[0]:
+                raise ValueError(
+                    "Length of prediction probabilities doesn't match number of rows of the dataset"
+                )
+            prob_column = f"{model.input_id}_probabilities"
+            if prob_column in self.columns:
+                warnings.warn(
+                    f"Probability column {prob_column} already exists in the dataset, overwriting the existing probabilities",
+                    UserWarning,
+                )
+
+            logger.info(
+                f"Assigning prediction probabilities to column '{prob_column}' and linked to model '{model.input_id}'"
+            )
+            self.__assign_prediction_probabilities(
+                model, prob_column, prediction_probabilities
+            )
+
+        # Step 6: Neither Specific Column Nor Values Provided
         elif not self.__model_id_in_prediction_columns(
             model=model, prediction_column=prediction_column
         ):
+
+            # Compute prediction values directly from the VM model
             pred_column = f"{model.input_id}_prediction"
             if pred_column in self.columns:
                 logger.info(
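Taken together, the numbered steps above give three ways to attach predictions. A minimal usage sketch; vm_dataset, vm_model, and the column and array names are illustrative stand-ins for an already-initialized VMDataset and VMModel:

    # Option 1: point at columns already present in the dataset (Steps 2 and 3)
    vm_dataset.assign_predictions(
        vm_model,
        prediction_column="churn_pred",
        probability_column="churn_prob",
    )

    # Option 2: pass precomputed arrays (Steps 4 and 5); columns named
    # f"{model.input_id}_prediction" / f"{model.input_id}_probabilities" are created
    vm_dataset.assign_predictions(
        vm_model,
        prediction_values=y_pred,
        prediction_probabilities=y_prob,
    )

    # Option 3: pass nothing (Step 6); predict() -- and predict_proba(), where
    # available -- are run on the feature columns
    vm_dataset.assign_predictions(vm_model)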
@@ -256,12 +606,107 @@ class NumpyDataset(VMDataset):
             )
 
             prediction_values = np.array(model.predict(x_only))
-
+
+            # Check if the prediction values are probabilities
+            if _is_probability(prediction_values):
+
+                threshold = 0.5
+
+                logger.info(
+                    "Predict method returned probabilities instead of direct labels or regression values. "
+                    + "This implies the model is likely configured for a classification task with probability output."
+                )
+                prob_column = f"{model.input_id}_probabilities"
+                logger.info(
+                    f"Assigning probabilities to column '{prob_column}' and computing class labels using a threshold of {threshold}."
+                )
+                self.__assign_prediction_probabilities(
+                    model, prob_column, prediction_values
+                )
+
+                # Convert probabilities to class labels based on the threshold
+                prediction_classes = (prediction_values > threshold).astype(int)
+                self.__assign_prediction_values(model, pred_column, prediction_classes)
+
+            else:
+
+                # If not assign the prediction values directly
+                pred_column = f"{model.input_id}_prediction"
+                self.__assign_prediction_values(model, pred_column, prediction_values)
+
+                try:
+                    logger.info("Running predict_proba()... This may take a while")
+                    prediction_probabilities = np.array(model.predict_proba(x_only))
+                    prob_column = f"{model.input_id}_probabilities"
+                    self.__assign_prediction_probabilities(
+                        model, prob_column, prediction_probabilities
+                    )
+                except MissingOrInvalidModelPredictFnError:
+                    # Log that predict_proba is not available or failed
+                    logger.warn(
+                        f"Model class '{model.__class__}' does not have a compatible predict_proba implementation."
+                        + " Please assign predictions directly with vm_dataset.assign_predictions(model, prediction_values)"
+                    )
+
+        # Step 7: Prediction Column Already Linked
         else:
             logger.info(
                 f"Prediction column {self._extra_columns['prediction_columns'][model.input_id]} already linked to the {model.input_id}"
             )
 
+    def get_extra_column(self, column_name):
+        """
+        Returns the values of the specified extra column.
+
+        Args:
+            column_name (str): The name of the extra column.
+
+        Returns:
+            np.ndarray: The values of the extra column.
+        """
+        if column_name not in self.extra_columns:
+            raise ValueError(f"Column {column_name} is not an extra column")
+
+        return self._df[column_name]
+
+    def add_extra_column(self, column_name, column_values=None):
+        """
+        Adds an extra column to the dataset without modifying the dataset `features` and `target` columns.
+
+        Args:
+            column_name (str): The name of the extra column.
+            column_values (np.ndarray, optional): The values of the extra column.
+        """
+        if column_name in self.extra_columns:
+            logger.info(f"Column {column_name} already registered as an extra column")
+            return
+
+        # The column name already exists in the dataset so we just assign the extra column
+        if column_name in self.columns:
+            self._extra_columns[column_name] = column_name
+            logger.info(
+                f"Column {column_name} exists in the dataset, registering as an extra column"
+            )
+            return
+
+        if column_values is None:
+            raise ValueError(
+                "Column values must be provided when the column doesn't exist in the dataset"
+            )
+
+        if len(column_values) != self.df.shape[0]:
+            raise ValueError(
+                "Length of column values doesn't match number of rows of the dataset"
+            )
+
+        self._raw_dataset = np.hstack(
+            (self._raw_dataset, np.array(column_values).reshape(-1, 1))
+        )
+        self._columns.append(column_name)
+        self._df[column_name] = column_values
+        self._extra_columns[column_name] = column_name
+        logger.info(f"Column {column_name} added as an extra column")
+
     @property
     def raw_dataset(self) -> np.ndarray:
         """
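The new extra-column helpers at the end of this hunk carry per-row metadata (for example IDs or segments used by group-by tests) without disturbing the feature and target columns. A short usage sketch; the variable and column names are illustrative:

    # Attach values that are not yet in the dataset
    vm_dataset.add_extra_column("loan_id", column_values=loan_ids)

    # A column already present in the dataset can be registered without values
    vm_dataset.add_extra_column("segment")

    # Read it back from the underlying DataFrame
    segments = vm_dataset.get_extra_column("segment")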
@@ -397,21 +842,83 @@ class NumpyDataset(VMDataset):
             ],
         ]
 
-    def y_pred(self,
+    def y_pred(self, model) -> np.ndarray:
         """
-        Returns the prediction
+        Returns the prediction variables for a given model, accommodating
+        both scalar predictions and multi-dimensional outputs such as embeddings.
+
+        Args:
+            model (VMModel): The model whose predictions are sought.
 
         Returns:
-            np.ndarray: The prediction variables
+            np.ndarray: The prediction variables, either as a flattened array for
+            scalar predictions or as an array of arrays for multi-dimensional outputs.
+        """
+        pred_column = self.prediction_column(model)
+
+        # First, attempt to retrieve the prediction data from the DataFrame
+        if hasattr(self, "_df") and pred_column in self._df.columns:
+            predictions = self._df[pred_column].to_numpy()
+
+            # Check if the predictions are stored as objects (e.g., lists for embeddings)
+            if self._df[pred_column].dtype == object:
+                # Attempt to convert lists to a numpy array
+                try:
+                    predictions = np.stack(predictions)
+                except ValueError as e:
+                    # Handling cases where predictions cannot be directly stacked
+                    raise ValueError(f"Error stacking prediction arrays: {e}")
+        else:
+            # Fallback to using the raw numpy dataset if DataFrame is not available or suitable
+            try:
+                predictions = self.raw_dataset[
+                    :, self.columns.index(pred_column)
+                ].flatten()
+            except IndexError as e:
+                raise ValueError(
+                    f"Prediction column '{pred_column}' not found in raw dataset: {e}"
+                )
+
+        return predictions
+
+    def y_prob(self, model) -> np.ndarray:
         """
-
-
-
-
-
-
-
-
+        Returns the prediction variables for a given model, accommodating
+        both scalar predictions and multi-dimensional outputs such as embeddings.
+
+        Args:
+            model (str): The ID of the model whose predictions are sought.
+
+        Returns:
+            np.ndarray: The prediction variables, either as a flattened array for
+            scalar predictions or as an array of arrays for multi-dimensional outputs.
+        """
+        prob_column = self.probability_column(model)
+
+        # First, attempt to retrieve the prediction data from the DataFrame
+        if hasattr(self, "_df") and prob_column in self._df.columns:
+            probabilities = self._df[prob_column].to_numpy()
+
+            # Check if the predictions are stored as objects (e.g., lists for embeddings)
+            if self._df[prob_column].dtype == object:
+                # Attempt to convert lists to a numpy array
+                try:
+                    probabilities = np.stack(probabilities)
+                except ValueError as e:
+                    # Handling cases where predictions cannot be directly stacked
+                    raise ValueError(f"Error stacking prediction arrays: {e}")
+        else:
+            # Fallback to using the raw numpy dataset if DataFrame is not available or suitable
+            try:
+                probabilities = self.raw_dataset[
+                    :, self.columns.index(prob_column)
+                ].flatten()
+            except IndexError as e:
+                raise ValueError(
+                    f"Prediction column '{prob_column}' not found in raw dataset: {e}"
+                )
+
+        return probabilities
 
     @property
     def type(self) -> str:
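The object-dtype branch in both accessors exists because multi-dimensional predictions (embeddings, probability vectors) are stored as one Python list per row, so pandas hands back an object array that must be re-stacked. A self-contained illustration of that round trip, independent of validmind:

    import numpy as np
    import pandas as pd

    # Multi-dimensional predictions are stored as a list per row...
    df = pd.DataFrame({"m_prediction": [[0.1, 0.2], [0.3, 0.4]]})
    col = df["m_prediction"].to_numpy()
    print(col.dtype)        # object -- an array of lists, not a 2-D array

    # ...so np.stack is needed to recover the (n_rows, n_dims) shape
    stacked = np.stack(col)
    print(stacked.shape)    # (2, 2)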
@@ -458,22 +965,32 @@ class NumpyDataset(VMDataset):
         """
         return self._df[self.target_column]
 
-    def y_pred_df(self,
+    def y_pred_df(self, model):
         """
         Returns the target columns (y) of the dataset.
 
         Returns:
             pd.DataFrame: The target columns.
         """
-        return self._df[self.prediction_column(
+        return self._df[self.prediction_column(model)]
 
-    def
+    def y_prob_df(self, model):
+        """
+        Returns the target columns (y) of the dataset.
+
+        Returns:
+            pd.DataFrame: The target columns.
+        """
+        return self._df[self.probability_column(model)]
+
+    def prediction_column(self, model) -> str:
         """
         Returns the prediction column name of the dataset.
 
         Returns:
             str: The prediction column name.
         """
+        model_id = model.input_id
         pred_column = self._extra_columns.get("prediction_columns", {}).get(model_id)
         if pred_column is None:
             raise ValueError(
@@ -481,6 +998,21 @@ class NumpyDataset(VMDataset):
             )
         return pred_column
 
+    def probability_column(self, model) -> str:
+        """
+        Returns the prediction column name of the dataset.
+
+        Returns:
+            str: The prediction column name.
+        """
+        model_id = model.input_id
+        prob_column = self._extra_columns.get("probability_columns", {}).get(model_id)
+        if prob_column is None:
+            raise ValueError(
+                f"Probability column is not linked with the given {model_id}"
+            )
+        return prob_column
+
     def serialize(self):
         """
         Serializes the dataset to a dictionary.
@@ -608,10 +1140,15 @@ class DataFrameDataset(NumpyDataset):
 
         Args:
             raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
+            input_id (str, optional): Identifier for the dataset. Defaults to None.
+            model (VMModel, optional): Model associated with the dataset. Defaults to None.
             target_column (str, optional): The target column of the dataset. Defaults to None.
+            extra_columns (dict, optional): Extra columns to include in the dataset. Defaults to None.
             feature_columns (list, optional): The feature columns of the dataset. Defaults to None.
-            text_column (str, optional): The text column name of the dataset for
-            target_class_labels (
+            text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
+            target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
+            options (dict, optional): Additional options for the dataset. Defaults to None.
+            date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
         """
         index = None
         if isinstance(raw_dataset.index, pd.Index):
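For orientation, this constructor is normally reached through the package's init_dataset entry point rather than instantiated directly. A minimal sketch, assuming the standard validmind.init_dataset wrapper (the data values are illustrative):

    import pandas as pd
    import validmind as vm

    df = pd.DataFrame(
        {"age": [34, 51, 27], "income": [52000, 87000, 31000], "default": [0, 1, 0]}
    )

    vm_ds = vm.init_dataset(
        dataset=df,
        input_id="train_ds",
        target_column="default",
    )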
@@ -634,6 +1171,57 @@ class DataFrameDataset(NumpyDataset):
         )
 
 
+@dataclass
+class PolarsDataset(NumpyDataset):
+    """
+    VM dataset implementation for Polars DataFrame.
+    """
+
+    def __init__(
+        self,
+        raw_dataset: pl.DataFrame,
+        input_id: str = None,
+        model: VMModel = None,
+        target_column: str = None,
+        extra_columns: dict = None,
+        feature_columns: list = None,
+        text_column: str = None,
+        target_class_labels: dict = None,
+        options: dict = None,
+        date_time_index: bool = False,
+    ):
+        """
+        Initializes a PolarsDataset instance.
+
+        Args:
+            raw_dataset (pl.DataFrame): The raw dataset as a Polars DataFrame.
+            input_id (str, optional): Identifier for the dataset. Defaults to None.
+            model (VMModel, optional): Model associated with the dataset. Defaults to None.
+            target_column (str, optional): The target column of the dataset. Defaults to None.
+            extra_columns (dict, optional): Extra columns to include in the dataset. Defaults to None.
+            feature_columns (list, optional): The feature columns of the dataset. Defaults to None.
+            text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
+            target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
+            options (dict, optional): Additional options for the dataset. Defaults to None.
+            date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
+        """
+        super().__init__(
+            raw_dataset=raw_dataset.to_numpy(),
+            input_id=input_id,
+            model=model,
+            index_name=None,
+            index=None,
+            columns=raw_dataset.columns,
+            target_column=target_column,
+            extra_columns=extra_columns,
+            feature_columns=feature_columns,
+            text_column=text_column,
+            target_class_labels=target_class_labels,
+            options=options,
+            date_time_index=date_time_index,
+        )
+
+
 @dataclass
 class TorchDataset(NumpyDataset):
     """
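PolarsDataset converts the frame with to_numpy() and reuses the NumpyDataset machinery, so downstream access (df, x_df, and so on) remains numpy/pandas based. A minimal construction sketch; the direct import path is an assumption based only on where the class is defined above:

    import polars as pl
    from validmind.vm_models.dataset import PolarsDataset

    raw = pl.DataFrame(
        {"age": [34, 51, 27], "income": [52000, 87000, 31000], "default": [0, 1, 0]}
    )

    vm_ds = PolarsDataset(
        raw_dataset=raw,
        input_id="polars_demo",
        target_column="default",
        feature_columns=["age", "income"],
    )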
@@ -668,12 +1256,16 @@ class TorchDataset(NumpyDataset):
         text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
         target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
         """
-
+
         try:
             import torch
         except ImportError:
-
+            raise ImportError(
+                "PyTorch is not installed, please run `pip install validmind[pytorch]`"
+            )
+
         columns = []
+
         for id, tens in zip(range(0, len(raw_dataset.tensors)), raw_dataset.tensors):
             if id == 0 and feature_columns is None:
                 n_cols = tens.shape[1]
@@ -684,9 +1276,11 @@ class TorchDataset(NumpyDataset):
                 ).astype(str)
             ]
             columns.append(feature_columns)
+
         elif id == 1 and target_column is None:
             target_column = "y"
             columns.append(target_column)
+
         elif id == 2 and extra_columns is None:
             extra_columns.prediction_column = "y_pred"
             columns.append(extra_columns.prediction_column)
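TorchDataset maps tensors to columns positionally: tensor 0 becomes the feature block (with auto-generated column names when feature_columns is unset), tensor 1 the target (named "y"), and tensor 2 a prediction column. Note that the id == 2 branch assigns an attribute on extra_columns only when it is None, which would raise AttributeError if a third tensor is passed without extra columns; passing two tensors avoids that path. A construction sketch under those caveats; the keyword set is inferred from the sibling classes and should be treated as an assumption:

    import torch
    from torch.utils.data import TensorDataset
    from validmind.vm_models.dataset import TorchDataset

    X = torch.randn(100, 4)                     # tensor 0 -> feature columns
    y = torch.randint(0, 2, (100, 1)).float()   # tensor 1 -> target column "y"

    vm_ds = TorchDataset(raw_dataset=TensorDataset(X, y), input_id="torch_demo")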