validmind-2.7.4-py3-none-any.whl → validmind-2.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. validmind/__version__.py +1 -1
  2. validmind/datasets/credit_risk/lending_club.py +354 -88
  3. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -2
  4. validmind/tests/load.py +4 -1
  5. validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +218 -0
  6. validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +153 -0
  7. validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py +144 -0
  8. validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +146 -0
  9. validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +191 -0
  10. validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +176 -0
  11. validmind/tests/ongoing_monitoring/FeatureDrift.py +120 -121
  12. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +18 -23
  13. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +86 -45
  14. validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +202 -0
  15. validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py +97 -0
  16. validmind/tests/ongoing_monitoring/ROCCurveDrift.py +149 -0
  17. validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +210 -0
  18. validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +207 -0
  19. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +91 -14
  20. validmind/vm_models/dataset/dataset.py +0 -4
  21. {validmind-2.7.4.dist-info → validmind-2.7.6.dist-info}/METADATA +3 -3
  22. {validmind-2.7.4.dist-info → validmind-2.7.6.dist-info}/RECORD +25 -14
  23. {validmind-2.7.4.dist-info → validmind-2.7.6.dist-info}/WHEEL +1 -1
  24. {validmind-2.7.4.dist-info → validmind-2.7.6.dist-info}/LICENSE +0 -0
  25. {validmind-2.7.4.dist-info → validmind-2.7.6.dist-info}/entry_points.txt +0 -0
validmind/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "2.7.4"
+ __version__ = "2.7.6"
validmind/datasets/credit_risk/lending_club.py CHANGED
@@ -3,13 +3,20 @@
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

  import os
-
+ import warnings
+ import logging
  import numpy as np
  import pandas as pd
  import scorecardpy as sc
  import statsmodels.api as sm
+
+ import xgboost as xgb
+ import validmind as vm
+
+ from sklearn.ensemble import RandomForestClassifier
  from sklearn.model_selection import train_test_split

+
  current_path = os.path.dirname(os.path.abspath(__file__))
  dataset_path = os.path.join(current_path, "datasets")

@@ -95,7 +102,7 @@ score_params = {
  }


- def load_data(source="online"):
+ def load_data(source="online", verbose=True):
      """
      Load data from either an online source or offline files, automatically dropping specified columns for offline data.

@@ -104,28 +111,33 @@ def load_data(source="online"):
      """

      if source == "online":
-         print(f"Loading data from an online source: {online_data_file}")
+         if verbose:
+             print(f"Loading data from an online source: {online_data_file}")
          df = pd.read_csv(online_data_file)
-         df = _clean_data(df)
+         df = _clean_data(df, verbose=verbose)

      elif source == "offline":
-         print(f"Loading data from an offline .gz file: {offline_data_file}")
+         if verbose:
+             print(f"Loading data from an offline .gz file: {offline_data_file}")
          # Since we know the offline_data_file path ends with '.zip', we replace it with '.csv.gz'
          gzip_file_path = offline_data_file.replace(".zip", ".csv.gz")
-         print(f"Attempting to read from .gz file: {gzip_file_path}")
+         if verbose:
+             print(f"Attempting to read from .gz file: {gzip_file_path}")
          # Read the CSV file directly from the .gz archive
          df = pd.read_csv(gzip_file_path, compression="gzip")
-         print("Data loaded successfully.")
+         if verbose:
+             print("Data loaded successfully.")
      else:
          raise ValueError("Invalid source specified. Choose 'online' or 'offline'.")

-     print(
-         f"Rows: {df.shape[0]}, Columns: {df.shape[1]}, Missing values: {df.isnull().sum().sum()}"
-     )
+     if verbose:
+         print(
+             f"Rows: {df.shape[0]}, Columns: {df.shape[1]}, Missing values: {df.isnull().sum().sum()}"
+         )
      return df


- def _clean_data(df):
+ def _clean_data(df, verbose=True):
      df = df.copy()

      # Drop columns not relevant for application scorecards
@@ -133,41 +145,45 @@ def _clean_data(df):

      # Drop rows with missing target values
      df.dropna(subset=[target_column], inplace=True)
-     print("Dropping rows with missing target values:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Dropping rows with missing target values:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Drop columns with more than N percent missing values
      missing_values = df.isnull().mean()
      df = df.loc[:, missing_values < 0.7]
-     print("Dropping columns with more than 70% missing values:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Dropping columns with more than 70% missing values:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Drop columns with only one unique value
      unique_values = df.nunique()
      df = df.loc[:, unique_values > 1]
-     print("Dropping columns with only one unique value:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Dropping columns with only one unique value:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Define the target variable for the model, representing loan default status.
      df[target_column] = df[target_column].map({"Fully Paid": 0, "Charged Off": 1})

      # Drop rows with NaN in target_column after mapping
      df.dropna(subset=[target_column], inplace=True)
-     print("Dropping rows with missing target values:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Dropping rows with missing target values:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      return df


- def preprocess(df):
+ def preprocess(df, verbose=True):
      df = df.copy()

      # Convert the target variable to integer type for modeling.
@@ -175,45 +191,51 @@ def preprocess(df):

      # Keep rows where purpose is 'debt_consolidation' or 'credit_card'
      df = df[df["purpose"].isin(["debt_consolidation", "credit_card"])]
-     print("Filtering 'purpose' to 'debt_consolidation' and 'credit_card':")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Filtering 'purpose' to 'debt_consolidation' and 'credit_card':")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Remove rows where grade is 'F' or 'G'
      df = df[~df["grade"].isin(["F", "G"])]
-     print("Filtering out 'grade' F and G:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Filtering out 'grade' F and G:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Remove rows where sub_grade starts with 'F' or 'G'
      df = df[~df["sub_grade"].str.startswith(("F", "G"))]
-     print("Filtering out 'sub_grade' F and G:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Filtering out 'sub_grade' F and G:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Remove rows where home_ownership is 'OTHER', 'NONE', or 'ANY'
      df = df[~df["home_ownership"].isin(["OTHER", "NONE", "ANY"])]
-     print("Filtering out 'home_ownership' OTHER, NONE, ANY:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Filtering out 'home_ownership' OTHER, NONE, ANY:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Drop features that are not useful for modeling
      df.drop(drop_features, axis=1, inplace=True)
-     print("Dropping specified features:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Dropping specified features:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Drop rows with missing values
      df.dropna(inplace=True)
-     print("Dropping rows with any missing values:")
-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("Dropping rows with any missing values:")
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      # Preprocess emp_length column
      df = _preprocess_emp_length(df)
@@ -260,34 +282,37 @@ def _preprocess_emp_length(df):
      return df


- def feature_engineering(df):
+ def feature_engineering(df, verbose=True):
      df = df.copy()

      # WoE encoding of numerical and categorical features
-     df = woe_encoding(df)
+     df = woe_encoding(df, verbose=verbose)

-     print(
-         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print(
+             f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+         )

      return df


- def woe_encoding(df):
+ def woe_encoding(df, verbose=True):
      df = df.copy()

-     woe = _woebin(df)
+     woe = _woebin(df, verbose=verbose)
      bins = _woe_to_bins(woe)

      # Make sure we don't transform the target column
      if target_column in bins:
          del bins[target_column]
-         print(f"Excluded {target_column} from WoE transformation.")
+         if verbose:
+             print(f"Excluded {target_column} from WoE transformation.")

      # Apply the WoE transformation
      df = sc.woebin_ply(df, bins=bins)

-     print("Successfully converted features to WoE values.")
+     if verbose:
+         print("Successfully converted features to WoE values.")

      return df

@@ -326,7 +351,7 @@ def _woe_to_bins(woe):
      return bins


- def _woebin(df):
+ def _woebin(df, verbose=True):
      """
      This function performs automatic binning using WoE.
      df: A pandas dataframe
@@ -337,9 +362,10 @@ def _woebin(df):
      df[non_numeric_cols] = df[non_numeric_cols].astype(str)

      try:
-         print(
-             f"Performing binning with breaks_adj: {breaks_adj}"
-         )  # print the breaks_adj being used
+         if verbose:
+             print(
+                 f"Performing binning with breaks_adj: {breaks_adj}"
+             )  # print the breaks_adj being used
          bins = sc.woebin(df, target_column, breaks_list=breaks_adj)
      except Exception as e:
          print("Error during binning: ")
@@ -355,7 +381,7 @@ def _woebin(df):
      return bins_df


- def split(df, validation_size=None, test_size=0.2, add_constant=False):
+ def split(df, validation_size=None, test_size=0.2, add_constant=False, verbose=True):
      """
      Split dataset into train, validation (optional), and test sets.

@@ -384,15 +410,16 @@ def split(df, validation_size=None, test_size=0.2, add_constant=False):
              train_val_df = sm.add_constant(train_val_df)

          # Print details for two-way split
-         print("After splitting the dataset into training and test sets:")
-         print(
-             f"Training Dataset:\nRows: {train_val_df.shape[0]}\nColumns: {train_val_df.shape[1]}\n"
-             f"Missing values: {train_val_df.isnull().sum().sum()}\n"
-         )
-         print(
-             f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\n"
-             f"Missing values: {test_df.isnull().sum().sum()}\n"
-         )
+         if verbose:
+             print("After splitting the dataset into training and test sets:")
+             print(
+                 f"Training Dataset:\nRows: {train_val_df.shape[0]}\nColumns: {train_val_df.shape[1]}\n"
+                 f"Missing values: {train_val_df.isnull().sum().sum()}\n"
+             )
+             print(
+                 f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\n"
+                 f"Missing values: {test_df.isnull().sum().sum()}\n"
+             )

          return train_val_df, test_df

@@ -407,19 +434,20 @@ def split(df, validation_size=None, test_size=0.2, add_constant=False):
          validation_df = sm.add_constant(validation_df)

      # Print details for three-way split
-     print("After splitting the dataset into training, validation, and test sets:")
-     print(
-         f"Training Dataset:\nRows: {train_df.shape[0]}\nColumns: {train_df.shape[1]}\n"
-         f"Missing values: {train_df.isnull().sum().sum()}\n"
-     )
-     print(
-         f"Validation Dataset:\nRows: {validation_df.shape[0]}\nColumns: {validation_df.shape[1]}\n"
-         f"Missing values: {validation_df.isnull().sum().sum()}\n"
-     )
-     print(
-         f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\n"
-         f"Missing values: {test_df.isnull().sum().sum()}\n"
-     )
+     if verbose:
+         print("After splitting the dataset into training, validation, and test sets:")
+         print(
+             f"Training Dataset:\nRows: {train_df.shape[0]}\nColumns: {train_df.shape[1]}\n"
+             f"Missing values: {train_df.isnull().sum().sum()}\n"
+         )
+         print(
+             f"Validation Dataset:\nRows: {validation_df.shape[0]}\nColumns: {validation_df.shape[1]}\n"
+             f"Missing values: {validation_df.isnull().sum().sum()}\n"
+         )
+         print(
+             f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\n"
+             f"Missing values: {test_df.isnull().sum().sum()}\n"
+         )

      return train_df, validation_df, test_df

@@ -822,3 +850,241 @@ def get_demo_test_config(x_test=None, y_test=None):
      }

      return default_config
+
+
+ def load_scorecard():
+
+     warnings.filterwarnings("ignore")
+     logging.getLogger("scorecardpy").setLevel(logging.ERROR)
+
+     os.environ["VALIDMIND_LLM_DESCRIPTIONS_CONTEXT_ENABLED"] = "1"
+
+     context = """
+ FORMAT FOR THE LLM DESCRIPTIONS:
+ **<Test Name>** is designed to <begin with a concise overview of what the test does and its primary purpose, extracted from the test description>.
+
+ The test operates by <write a paragraph about the test mechanism, explaining how it works and what it measures. Include any relevant formulas or methodologies mentioned in the test description.>
+
+ The primary advantages of this test include <write a paragraph about the test's strengths and capabilities, highlighting what makes it particularly useful for specific scenarios.>
+
+ Users should be aware that <write a paragraph about the test's limitations and potential risks. Include both technical limitations and interpretation challenges. If the test description includes specific signs of high risk, incorporate these here.>
+
+ **Key Insights:**
+
+ The test results reveal:
+
+ - **<insight title>**: <comprehensive description of one aspect of the results>
+ - **<insight title>**: <comprehensive description of another aspect>
+ ...
+
+ Based on these results, <conclude with a brief paragraph that ties together the test results with the test's purpose and provides any final recommendations or considerations.>
+
+ ADDITIONAL INSTRUCTIONS:
+ Present insights in order from general to specific, with each insight as a single bullet point with bold title.
+
+ For each metric in the test results, include in the test overview:
+ - The metric's purpose and what it measures
+ - Its mathematical formula
+ - The range of possible values
+ - What constitutes good/bad performance
+ - How to interpret different values
+
+ Each insight should progressively cover:
+ 1. Overall scope and distribution
+ 2. Complete breakdown of all elements with specific values
+ 3. Natural groupings and patterns
+ 4. Comparative analysis between datasets/categories
+ 5. Stability and variations
+ 6. Notable relationships or dependencies
+
+ Remember:
+ - Keep all insights at the same level (no sub-bullets or nested structures)
+ - Make each insight complete and self-contained
+ - Include specific numerical values and ranges
+ - Cover all elements in the results comprehensively
+ - Maintain clear, concise language
+ - Use only "- **Title**: Description" format for insights
+ - Progress naturally from general to specific observations
+
+ """.strip()
+
+     os.environ["VALIDMIND_LLM_DESCRIPTIONS_CONTEXT"] = context
+
+     # Load the data
+     df = load_data(source="offline", verbose=False)
+     preprocess_df = preprocess(df, verbose=False)
+     fe_df = feature_engineering(preprocess_df, verbose=False)
+
+     # Split the data
+     train_df, test_df = split(fe_df, test_size=0.2, verbose=False)
+
+     x_train = train_df.drop(target_column, axis=1)
+     y_train = train_df[target_column]
+
+     x_test = test_df.drop(target_column, axis=1)
+     y_test = test_df[target_column]
+
+     # Define the XGBoost model
+     xgb_model = xgb.XGBClassifier(
+         n_estimators=50, random_state=42, early_stopping_rounds=10
+     )
+     xgb_model.set_params(
+         eval_metric=["error", "logloss", "auc"],
+     )
+
+     # Fit the model
+     xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)
+
+     # Define the Random Forest model
+     rf_model = RandomForestClassifier(
+         n_estimators=50,
+         random_state=42,
+     )
+
+     # Fit the model
+     rf_model.fit(x_train, y_train)
+
+     # Compute the probabilities
+     train_xgb_prob = xgb_model.predict_proba(x_train)[:, 1]
+     test_xgb_prob = xgb_model.predict_proba(x_test)[:, 1]
+
+     train_rf_prob = rf_model.predict_proba(x_train)[:, 1]
+     test_rf_prob = rf_model.predict_proba(x_test)[:, 1]
+
+     # Compute binary predictions
+     cut_off_threshold = 0.3
+
+     train_xgb_binary_predictions = (train_xgb_prob > cut_off_threshold).astype(int)
+     test_xgb_binary_predictions = (test_xgb_prob > cut_off_threshold).astype(int)
+
+     train_rf_binary_predictions = (train_rf_prob > cut_off_threshold).astype(int)
+     test_rf_binary_predictions = (test_rf_prob > cut_off_threshold).astype(int)
+
+     # Compute credit risk scores
+     train_xgb_scores = compute_scores(train_xgb_prob)
+     test_xgb_scores = compute_scores(test_xgb_prob)
+
+     scorecard = {
+         "df": df,
+         "preprocess_df": preprocess_df,
+         "fe_df": fe_df,
+         "train_df": train_df,
+         "test_df": test_df,
+         "x_test": x_test,
+         "y_test": y_test,
+         "xgb_model": xgb_model,
+         "rf_model": rf_model,
+         "train_xgb_binary_predictions": train_xgb_binary_predictions,
+         "test_xgb_binary_predictions": test_xgb_binary_predictions,
+         "train_xgb_prob": train_xgb_prob,
+         "test_xgb_prob": test_xgb_prob,
+         "train_xgb_scores": train_xgb_scores,
+         "test_xgb_scores": test_xgb_scores,
+         "train_rf_binary_predictions": train_rf_binary_predictions,
+         "test_rf_binary_predictions": test_rf_binary_predictions,
+         "train_rf_prob": train_rf_prob,
+         "test_rf_prob": test_rf_prob,
+     }
+
+     return scorecard
+
+
+ def init_vm_objects(scorecard):
+
+     df = scorecard["df"]
+     preprocess_df = scorecard["preprocess_df"]
+     fe_df = scorecard["fe_df"]
+     train_df = scorecard["train_df"]
+     test_df = scorecard["test_df"]
+     xgb_model = scorecard["xgb_model"]
+     rf_model = scorecard["rf_model"]
+     train_xgb_binary_predictions = scorecard["train_xgb_binary_predictions"]
+     test_xgb_binary_predictions = scorecard["test_xgb_binary_predictions"]
+     train_xgb_prob = scorecard["train_xgb_prob"]
+     test_xgb_prob = scorecard["test_xgb_prob"]
+     train_rf_binary_predictions = scorecard["train_rf_binary_predictions"]
+     test_rf_binary_predictions = scorecard["test_rf_binary_predictions"]
+     train_rf_prob = scorecard["train_rf_prob"]
+     test_rf_prob = scorecard["test_rf_prob"]
+     train_xgb_scores = scorecard["train_xgb_scores"]
+     test_xgb_scores = scorecard["test_xgb_scores"]
+
+     vm.init_dataset(
+         dataset=df,
+         input_id="raw_dataset",
+         target_column=target_column,
+     )
+
+     vm.init_dataset(
+         dataset=preprocess_df,
+         input_id="preprocess_dataset",
+         target_column=target_column,
+     )
+
+     vm.init_dataset(
+         dataset=fe_df,
+         input_id="fe_dataset",
+         target_column=target_column,
+     )
+
+     vm_train_ds = vm.init_dataset(
+         dataset=train_df,
+         input_id="train_dataset",
+         target_column=target_column,
+     )
+
+     vm_test_ds = vm.init_dataset(
+         dataset=test_df,
+         input_id="test_dataset",
+         target_column=target_column,
+     )
+
+     vm_xgb_model = vm.init_model(
+         xgb_model,
+         input_id="xgb_model",
+     )
+
+     vm_rf_model = vm.init_model(
+         rf_model,
+         input_id="rf_model",
+     )
+
+     # Assign predictions
+     vm_train_ds.assign_predictions(
+         model=vm_xgb_model,
+         prediction_values=train_xgb_binary_predictions,
+         prediction_probabilities=train_xgb_prob,
+     )
+
+     vm_test_ds.assign_predictions(
+         model=vm_xgb_model,
+         prediction_values=test_xgb_binary_predictions,
+         prediction_probabilities=test_xgb_prob,
+     )
+
+     vm_train_ds.assign_predictions(
+         model=vm_rf_model,
+         prediction_values=train_rf_binary_predictions,
+         prediction_probabilities=train_rf_prob,
+     )
+
+     vm_test_ds.assign_predictions(
+         model=vm_rf_model,
+         prediction_values=test_rf_binary_predictions,
+         prediction_probabilities=test_rf_prob,
+     )
+
+     # Assign scores to the datasets
+     vm_train_ds.add_extra_column("xgb_scores", train_xgb_scores)
+     vm_test_ds.add_extra_column("xgb_scores", test_xgb_scores)
+
+
+ def load_test_config(scorecard):
+
+     x_test = scorecard["x_test"]
+     y_test = scorecard["y_test"]
+
+     # Get the test config
+     test_config = get_demo_test_config(x_test, y_test)
+
+     return test_config
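
Note: the `verbose` flag introduced above threads through the whole demo pipeline (`load_data` → `preprocess` → `feature_engineering` → `split`), and the new `load_scorecard`, `init_vm_objects`, and `load_test_config` helpers bundle that pipeline together with model training. A minimal usage sketch, assuming the package's offline dataset files are present and (for `init_vm_objects`) that a ValidMind session has already been initialized via `vm.init()`:

    from validmind.datasets.credit_risk import lending_club

    # Prepare the data silently with the new verbose flag
    df = lending_club.load_data(source="offline", verbose=False)
    df = lending_club.preprocess(df, verbose=False)
    df = lending_club.feature_engineering(df, verbose=False)
    train_df, test_df = lending_club.split(df, test_size=0.2, verbose=False)

    # Or let the new helpers run the full demo setup: data prep, XGBoost and
    # Random Forest training, scoring, and registration of datasets/models
    scorecard = lending_club.load_scorecard()
    lending_club.init_vm_objects(scorecard)
    test_config = lending_club.load_test_config(scorecard)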
validmind/tests/data_validation/HighPearsonCorrelation.py CHANGED
@@ -9,7 +9,10 @@ from validmind.vm_models import VMDataset
  @tags("tabular_data", "data_quality", "correlation")
  @tasks("classification", "regression")
  def HighPearsonCorrelation(
-     dataset: VMDataset, max_threshold: float = 0.3, top_n_correlations: int = 10
+     dataset: VMDataset,
+     max_threshold: float = 0.3,
+     top_n_correlations: int = 10,
+     feature_columns: list = None,
  ):
      """
      Identifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity.
@@ -51,8 +54,15 @@
      - Limited to identifying redundancy only within feature pairs; may fail to spot more complex relationships among
        three or more variables.
      """
+
+     # Select features
+     if feature_columns is None:
+         df = dataset.df
+     else:
+         df = dataset.df[feature_columns]
+
      # Get correlation matrix for numeric columns
-     corr = dataset.df.corr(numeric_only=True)
+     corr = df.corr(numeric_only=True)

      # Create table of correlation coefficients and column pairs
      pairs = []
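
The new `feature_columns` parameter lets `HighPearsonCorrelation` be scoped to a subset of columns rather than the full dataframe. A sketch of how it might be invoked through ValidMind's `run_test` harness; the dataset input and column names below are hypothetical:

    import validmind as vm

    result = vm.tests.run_test(
        "validmind.data_validation.HighPearsonCorrelation",
        inputs={"dataset": vm_train_ds},  # a dataset registered via vm.init_dataset
        params={
            "max_threshold": 0.3,
            "top_n_correlations": 10,
            "feature_columns": ["loan_amnt", "int_rate", "dti"],  # hypothetical subset
        },
    )

When `feature_columns` is omitted, the behavior is unchanged: the correlation matrix is computed over all numeric columns of `dataset.df`.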
validmind/tests/load.py CHANGED
@@ -191,7 +191,7 @@ def list_tags():
      return list(unique_tags)


- def list_tasks_and_tags():
+ def list_tasks_and_tags(as_json=False):
      """
      List all task types and their associated tags, with one row per task type and
      all tags for a task type in one row.
@@ -205,6 +205,9 @@
          for task in test.__tasks__:
              task_tags_dict.setdefault(task, set()).update(test.__tags__)

+     if as_json:
+         return task_tags_dict
+
      return format_dataframe(
          pd.DataFrame(
              [
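
With `as_json=True`, `list_tasks_and_tags` skips the formatted dataframe and returns the underlying mapping; despite the parameter name, this is a plain dict of task type → tag set rather than a JSON string. A short sketch, assuming the function is re-exported from `validmind.tests` like the other loaders:

    from validmind.tests import list_tasks_and_tags

    task_tags = list_tasks_and_tags(as_json=True)
    # e.g. {"classification": {"tabular_data", "correlation", ...}, ...}
    for task, tags in sorted(task_tags.items()):
        print(task, sorted(tags))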