validmind 2.0.7__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. validmind/__init__.py +3 -3
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +7 -11
  4. validmind/api_client.py +29 -27
  5. validmind/client.py +10 -3
  6. validmind/datasets/credit_risk/__init__.py +11 -0
  7. validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
  8. validmind/datasets/credit_risk/lending_club.py +394 -0
  9. validmind/logging.py +9 -2
  10. validmind/template.py +2 -2
  11. validmind/test_suites/__init__.py +4 -2
  12. validmind/tests/__init__.py +97 -50
  13. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
  14. validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
  15. validmind/tests/data_validation/ScatterPlot.py +8 -2
  16. validmind/tests/decorator.py +138 -14
  17. validmind/tests/model_validation/BertScore.py +1 -1
  18. validmind/tests/model_validation/BertScoreAggregate.py +1 -1
  19. validmind/tests/model_validation/BleuScore.py +1 -1
  20. validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
  21. validmind/tests/model_validation/ContextualRecall.py +1 -1
  22. validmind/tests/model_validation/FeaturesAUC.py +110 -0
  23. validmind/tests/model_validation/MeteorScore.py +1 -1
  24. validmind/tests/model_validation/RegardHistogram.py +1 -1
  25. validmind/tests/model_validation/RegardScore.py +1 -1
  26. validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
  27. validmind/tests/model_validation/RougeMetrics.py +1 -1
  28. validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
  29. validmind/tests/model_validation/SelfCheckNLIScore.py +1 -1
  30. validmind/tests/model_validation/TokenDisparity.py +1 -1
  31. validmind/tests/model_validation/ToxicityHistogram.py +1 -1
  32. validmind/tests/model_validation/ToxicityScore.py +1 -1
  33. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  34. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
  35. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +1 -1
  36. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
  37. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +15 -18
  38. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
  39. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  40. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
  41. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
  42. validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
  43. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
  44. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
  45. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
  46. validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
  47. validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
  48. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
  49. validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
  50. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +33 -3
  51. validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
  52. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +2 -2
  53. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
  54. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
  55. validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
  56. validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
  57. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
  58. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
  59. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  60. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  61. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  62. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  63. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
  64. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
  65. validmind/tests/test_providers.py +14 -124
  66. validmind/unit_metrics/__init__.py +76 -69
  67. validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
  68. validmind/unit_metrics/classification/sklearn/F1.py +13 -0
  69. validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
  70. validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
  71. validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
  72. validmind/unit_metrics/composite.py +24 -71
  73. validmind/unit_metrics/regression/GiniCoefficient.py +20 -26
  74. validmind/unit_metrics/regression/HuberLoss.py +12 -16
  75. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +18 -24
  76. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +7 -13
  77. validmind/unit_metrics/regression/MeanBiasDeviation.py +5 -14
  78. validmind/unit_metrics/regression/QuantileLoss.py +6 -16
  79. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +12 -18
  80. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +6 -15
  81. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +5 -14
  82. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +6 -15
  83. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +11 -14
  84. validmind/utils.py +18 -45
  85. validmind/vm_models/__init__.py +0 -2
  86. validmind/vm_models/dataset.py +255 -16
  87. validmind/vm_models/test/metric.py +1 -2
  88. validmind/vm_models/test/result_wrapper.py +12 -13
  89. validmind/vm_models/test/test.py +2 -1
  90. validmind/vm_models/test/threshold_test.py +1 -2
  91. validmind/vm_models/test_suite/summary.py +3 -3
  92. validmind/vm_models/test_suite/test_suite.py +2 -1
  93. {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/METADATA +10 -6
  94. {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/RECORD +97 -96
  95. validmind/tests/__types__.py +0 -62
  96. validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
  97. validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
  98. validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
  99. validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
  100. validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -22
  101. validmind/unit_metrics/sklearn/classification/F1.py +0 -24
  102. validmind/unit_metrics/sklearn/classification/Precision.py +0 -24
  103. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -22
  104. validmind/unit_metrics/sklearn/classification/Recall.py +0 -22
  105. validmind/vm_models/test/unit_metric.py +0 -88
  106. {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/LICENSE +0 -0
  107. {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/WHEEL +0 -0
  108. {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/entry_points.txt +0 -0
validmind/__init__.py CHANGED
@@ -60,8 +60,7 @@ from .client import ( # noqa: E402
     run_documentation_tests,
     run_test_suite,
 )
-from .tests.decorator import metric
-from .unit_metrics import run_metric
+from .tests.decorator import metric, tags, tasks
 from .utils import run_async  # noqa: E402
 
 
@@ -111,6 +110,8 @@ __all__ = [ # noqa
     "reload",
     "run_documentation_tests",
     "run_test_suite",
+    "tags",
+    "tasks",
     "tests",
     "test_suites",
     "vm_models",
@@ -119,5 +120,4 @@ __all__ = [ # noqa
     "log_figure",
     "log_metrics",
     "log_test_results",
-    "run_metric",
 ]
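
Note: 2.1.1 drops the top-level `run_metric` export and adds `tags` and `tasks` decorators alongside `metric`. A minimal sketch of how the new decorators might combine on a custom metric function — only the names `metric`, `tags`, and `tasks` are confirmed by this diff; the argument style shown is an assumption:

```python
import validmind as vm

# Assumed usage: `tags` and `tasks` attach metadata used when listing/filtering
# tests; the string arguments here are hypothetical examples.
@vm.metric("my_org.CustomAccuracy")
@vm.tags("classification", "sklearn")
@vm.tasks("classification")
def custom_accuracy(model, dataset):
    """Compare model predictions against the dataset's target column."""
    ...
```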
validmind/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = "2.0.7"
+__version__ = "2.1.1"
validmind/ai.py CHANGED
@@ -7,8 +7,6 @@ import os
 
 from openai import AzureOpenAI, OpenAI
 
-from .utils import clean_docstring
-
 SYSTEM_PROMPT = """
 You are an expert data scientist and MRM specialist tasked with providing concise and'
 objective insights based on the results of quantitative model or dataset analysis.
@@ -23,22 +21,20 @@ Your analysis will act as the description of the result in the model documentati
 
 Avoid long sentences and complex vocabulary.
 Structure the response clearly and logically.
-Use Markdown syntax to format the response.
+Use valid Markdown syntax to format the response (tables are supported).
 Use the Test ID that is provided to form the Test Name e.g. "ClassImbalance" -> "Class Imbalance".
-Use the following format for the response:
+Use the following format for the response (feel free to modify slightly if necessary):
 ```
 **<Test Name>** <continue to explain what it does in detail>...
 
 The results of this test <detailed explanation of the results>...
 
-In summary the following key insights can be gained from this <Test Type>
+In summary the following key insights can be gained:
 
 - **<key insight 1 - title>**: <explanation of key insight 1>
 - ...<continue with any other key insights using the same format>
 ```
 It is very important that the text is nicely formatted and contains enough information to be useful to the user as documentation.
-
-- use valid markdown syntax: make sure to have two newlines between paragraphs and before bullet points etc.
 """.strip()
 USER_PROMPT = """
 Test ID: {test_name}
@@ -71,7 +67,7 @@ def __get_client_and_model():
 
     if "OPENAI_API_KEY" in os.environ:
         __client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-        __model = os.environ.get("VM_OPENAI_MODEL", "gpt-4-turbo-preview")
+        __model = os.environ.get("VM_OPENAI_MODEL", "gpt-4-turbo")
 
     elif "AZURE_OPENAI_KEY" in os.environ:
         if "AZURE_OPENAI_ENDPOINT" not in os.environ:
@@ -111,7 +107,7 @@ class DescriptionFuture:
 
     def get_description(self):
         # This will block until the future is completed
-        return clean_docstring(self._future.result())
+        return self._future.result()
 
 
 def generate_description_async(
@@ -132,7 +128,7 @@ def generate_description_async(
         raise ValueError("No results, summary or figures provided")
 
         response = client.chat.completions.create(
-            model="gpt-4-1106-vision-preview",
+            model="gpt-4-turbo",
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {
@@ -160,7 +156,7 @@ def generate_description_async(
         )
     else:
         response = client.chat.completions.create(
-            model="gpt-4-turbo-preview",
+            model="gpt-4-turbo",
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {
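
All OpenAI calls now default to `gpt-4-turbo` (previously `gpt-4-turbo-preview` for text and `gpt-4-1106-vision-preview` for figures). Per the hunks above, the default can still be overridden through the environment:

```python
import os

os.environ["OPENAI_API_KEY"] = "sk-..."        # placeholder value
os.environ["VM_OPENAI_MODEL"] = "gpt-4-turbo"  # optional; this is now the default
```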
validmind/api_client.py CHANGED
@@ -16,6 +16,7 @@ from io import BytesIO
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import aiohttp
+import mistune
 import requests
 from aiohttp import FormData
 
@@ -294,32 +295,33 @@ async def log_figures(figures: List[Figure]) -> Dict[str, Any]:
     Returns:
         dict: The response from the API
     """
-    if client_config.can_log_figures():  # check if the backend supports batch logging
-        try:
-            data = {}
-            files = {}
-            for figure in figures:
-                data.update(
-                    {f"{k}-{figure.key}": v for k, v in figure.serialize().items()}
-                )
-                files.update(
-                    {
-                        f"{k}-{figure.key}": v
-                        for k, v in figure.serialize_files().items()
-                    }
-                )
-
-            return await _post(
-                "log_figures",
-                data=data,
-                files=files,
-            )
-        except Exception as e:
-            logger.error("Error logging figures to ValidMind API")
-            raise e
-
-    else:
-        return await asyncio.gather(*[log_figure(figure) for figure in figures])
+    # this actually slows things down - better to log them in parallel
+    # if client_config.can_log_figures():  # check if the backend supports batch logging
+    #     try:
+    #         data = {}
+    #         files = {}
+    #         for figure in figures:
+    #             data.update(
+    #                 {f"{k}-{figure.key}": v for k, v in figure.serialize().items()}
+    #             )
+    #             files.update(
+    #                 {
+    #                     f"{k}-{figure.key}": v
+    #                     for k, v in figure.serialize_files().items()
+    #                 }
+    #             )
+
+    #         return await _post(
+    #             "log_figures",
+    #             data=data,
+    #             files=files,
+    #         )
+    #     except Exception as e:
+    #         logger.error("Error logging figures to ValidMind API")
+    #         raise e
+
+    # else:
+    return await asyncio.gather(*[log_figure(figure) for figure in figures])
 
 
 async def log_metadata(
@@ -342,7 +344,7 @@ async def log_metadata(
     """
     metadata_dict = {"content_id": content_id}
     if text is not None:
-        metadata_dict["text"] = text
+        metadata_dict["text"] = mistune.html(text)
     if _json is not None:
         metadata_dict["json"] = _json
 
validmind/client.py CHANGED
@@ -61,8 +61,13 @@ def init_dataset(
     """
     Initializes a VM Dataset, which can then be passed to other functions
     that can perform additional analysis and tests on the data. This function
-    also ensures we are reading a valid dataset type. We only support Pandas
-    DataFrames at the moment.
+    also ensures we are reading a valid dataset type.
+
+    The following dataset types are supported:
+    - Pandas DataFrame
+    - Polars DataFrame
+    - Numpy ndarray
+    - Torch TensorDataset
 
     Args:
         dataset : dataset from various python libraries
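
The expanded docstring above reflects the broader input support in `init_dataset`. A minimal sketch, assuming `vm.init()` has already been called to connect to a project; keyword names beyond `dataset` and `target_column` are omitted:

```python
import pandas as pd
import validmind as vm

# Any of the supported types (pandas, polars, numpy, torch TensorDataset) works here.
df = pd.DataFrame({"income": [50_000, 82_000], "loan_status": [0, 1]})
vm_ds = vm.init_dataset(dataset=df, target_column="loan_status")
```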
@@ -380,7 +385,7 @@ def preview_template():
 
 
 def run_documentation_tests(
-    section=None, send=True, fail_fast=False, inputs=None, **kwargs
+    section=None, send=True, fail_fast=False, inputs=None, config=None, **kwargs
 ):
     """Collect and run all the tests associated with a template
 
@@ -393,6 +398,7 @@ def run_documentation_tests(
         send (bool, optional): Whether to send the results to the ValidMind API. Defaults to True.
         fail_fast (bool, optional): Whether to stop running tests after the first failure. Defaults to False.
         inputs (dict, optional): A dictionary of test inputs to pass to the TestSuite
+        config: A dictionary of test parameters to override the defaults
         **kwargs: backwards compatibility for passing in test inputs using keyword arguments
 
     Returns:
@@ -421,6 +427,7 @@ def run_documentation_tests(
             send=send,
             fail_fast=fail_fast,
             inputs=inputs,
+            config=config,
             **kwargs,
         )
         test_suites[_section] = test_suite
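
The new `config` argument threads through to the underlying test suite. The diff only documents it as "test parameters to override the defaults", so the schema below is a guess — a plausible call treating keys as test IDs mapping to parameter overrides:

```python
import validmind as vm

vm.run_documentation_tests(
    inputs={"dataset": vm_ds, "model": vm_model},  # vm_model assumed defined
    config={
        # assumed shape: test ID -> parameter overrides (parameter name hypothetical)
        "validmind.model_validation.sklearn.ROCCurve": {"fpr_threshold": 0.1},
    },
)
```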
validmind/datasets/credit_risk/__init__.py ADDED
@@ -0,0 +1,11 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""
+Entrypoint for credit risk datasets.
+"""
+
+__all__ = [
+    "lending_club",
+]
validmind/datasets/credit_risk/lending_club.py ADDED
@@ -0,0 +1,394 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import os
+
+import numpy as np
+import pandas as pd
+import scorecardpy as sc
+import statsmodels.api as sm
+from sklearn.model_selection import train_test_split
+
+current_path = os.path.dirname(os.path.abspath(__file__))
+dataset_path = os.path.join(current_path, "datasets")
+
+# URLs or file paths for online and offline data
+online_data_file = "https://vmai.s3.us-west-1.amazonaws.com/datasets/lending_club_loan_data_2007_2014.csv"
+offline_data_file = os.path.join(
+    dataset_path, "lending_club_loan_data_2007_2014_clean.csv.gz"
+)
+
+target_column = "loan_status"
+
+drop_columns = [
+    "Unnamed: 0",
+    "id",
+    "member_id",
+    "funded_amnt",
+    "emp_title",
+    "url",
+    "desc",
+    "application_type",
+    "title",
+    "zip_code",
+    "delinq_2yrs",
+    "mths_since_last_delinq",
+    "mths_since_last_record",
+    "mths_since_last_major_derog",
+    "revol_bal",
+    "total_rec_prncp",
+    "total_rec_late_fee",
+    "recoveries",
+    "out_prncp_inv",
+    "out_prncp",
+    "collection_recovery_fee",
+    "next_pymnt_d",
+    "initial_list_status",
+    "pub_rec",
+    "collections_12_mths_ex_med",
+    "policy_code",
+    "acc_now_delinq",
+    "pymnt_plan",
+    "tot_coll_amt",
+    "tot_cur_bal",
+    "total_rev_hi_lim",
+    "last_pymnt_d",
+    "last_credit_pull_d",
+    "earliest_cr_line",
+    "issue_d",
+    "addr_state",
+    "dti",
+    "revol_util",
+    "total_pymnt_inv",
+    "inq_last_6mths",
+    "total_rec_int",
+    "last_pymnt_amnt",
+]
+
+drop_features = [
+    "loan_amnt",
+    "funded_amnt_inv",
+    "total_pymnt",
+]
+
+categorical_variables = [
+    "term",
+    "grade",
+    "sub_grade",
+    "emp_length",
+    "home_ownership",
+    "verification_status",
+    "purpose",
+]
+
+breaks_adj = {
+    "loan_amnt": [5000, 10000, 15000, 20000, 25000],
+    "int_rate": [10, 15, 20],
+    "annual_inc": [50000, 100000, 150000],
+}
+
+score_params = {
+    "target_score": 600,
+    "target_odds": 50,
+    "pdo": 20,
+}
+
+
+def load_data(source="online"):
+    """
+    Load data from either an online source or offline files, automatically dropping specified columns for offline data.
+
+    :param source: 'online' for online data, 'offline' for offline files. Defaults to 'online'.
+    :return: DataFrame containing the loaded data.
+    """
+
+    if source == "online":
+        print(f"Loading data from an online source: {online_data_file}")
+        df = pd.read_csv(online_data_file)
+        df = _clean_data(df)
+
+    elif source == "offline":
+        print(f"Loading data from an offline .gz file: {offline_data_file}")
+        # Since we know the offline_data_file path ends with '.zip', we replace it with '.csv.gz'
+        gzip_file_path = offline_data_file.replace(".zip", ".csv.gz")
+        print(f"Attempting to read from .gz file: {gzip_file_path}")
+        # Read the CSV file directly from the .gz archive
+        df = pd.read_csv(gzip_file_path, compression="gzip")
+        print("Data loaded successfully.")
+    else:
+        raise ValueError("Invalid source specified. Choose 'online' or 'offline'.")
+
+    print(
+        f"Rows: {df.shape[0]}, Columns: {df.shape[1]}, Missing values: {df.isnull().sum().sum()}"
+    )
+    return df
+
+
+def _clean_data(df):
+    df = df.copy()
+
+    # Drop columns not relevant for application scorecards
+    df = df.drop(columns=drop_columns)
+
+    # Drop rows with missing target values
+    df.dropna(subset=[target_column], inplace=True)
+    print("Dropping rows with missing target values:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Drop columns with more than N percent missing values
+    missing_values = df.isnull().mean()
+    df = df.loc[:, missing_values < 0.7]
+    print("Dropping columns with more than 70% missing values:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Drop columns with only one unique value
+    unique_values = df.nunique()
+    df = df.loc[:, unique_values > 1]
+    print("Dropping columns with only one unique value:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Define the target variable for the model, representing loan default status.
+    df[target_column] = df[target_column].map({"Fully Paid": 0, "Charged Off": 1})
+
+    # Drop rows with NaN in target_column after mapping
+    df.dropna(subset=[target_column], inplace=True)
+    print("Dropping rows with missing target values:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    return df
+
+
+def preprocess(df):
+    df = df.copy()
+
+    # Convert the target variable to integer type for modeling.
+    df[target_column] = df[target_column].astype(int)
+
+    # Keep rows where purpose is 'debt_consolidation' or 'credit_card'
+    df = df[df["purpose"].isin(["debt_consolidation", "credit_card"])]
+    print("Filtering 'purpose' to 'debt_consolidation' and 'credit_card':")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Remove rows where grade is 'F' or 'G'
+    df = df[~df["grade"].isin(["F", "G"])]
+    print("Filtering out 'grade' F and G:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Remove rows where sub_grade starts with 'F' or 'G'
+    df = df[~df["sub_grade"].str.startswith(("F", "G"))]
+    print("Filtering out 'sub_grade' F and G:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Remove rows where home_ownership is 'OTHER', 'NONE', or 'ANY'
+    df = df[~df["home_ownership"].isin(["OTHER", "NONE", "ANY"])]
+    print("Filtering out 'home_ownership' OTHER, NONE, ANY:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Drop features that are not useful for modeling
+    df.drop(drop_features, axis=1, inplace=True)
+    print("Dropping specified features:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Drop rows with missing values
+    df.dropna(inplace=True)
+    print("Dropping rows with any missing values:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Preprocess emp_length column
+    df = _preprocess_emp_length(df)
+
+    # Preprocess term column
+    df = _preprocess_term(df)
+
+    return df
+
+
+def _preprocess_term(df):
+    df = df.copy()
+
+    # Remove ' months' and convert to integer
+    df["term"] = df["term"].str.replace(" months", "").astype(object)
+
+    return df
+
+
+def _preprocess_emp_length(df):
+    df = df.copy()
+
+    # Mapping string values to numbers
+    emp_length_map = {
+        "10+ years": 10,
+        "< 1 year": 0,
+        "1 year": 1,
+        "2 years": 2,
+        "3 years": 3,
+        "4 years": 4,
+        "5 years": 5,
+        "6 years": 6,
+        "7 years": 7,
+        "8 years": 8,
+        "9 years": 9,
+    }
+
+    # Apply the mapping to the emp_length column
+    df["emp_length"] = df["emp_length"].map(emp_length_map).astype(object)
+
+    # Drop rows where emp_length is NaN after mapping
+    # df.dropna(subset=["emp_length"], inplace=True)
+
+    return df
+
+
+def feature_engineering(df):
+    df = df.copy()
+
+    # WoE encoding of numerical and categorical features
+    df = woe_encoding(df)
+
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    return df
+
+
+def woe_encoding(df):
+    df = df.copy()
+
+    woe = _woebin(df)
+    bins = _woe_to_bins(woe)
+
+    # Make sure we don't transform the target column
+    if target_column in bins:
+        del bins[target_column]
+        print(f"Excluded {target_column} from WoE transformation.")
+
+    # Apply the WoE transformation
+    df = sc.woebin_ply(df, bins=bins)
+
+    print("Successfully converted features to WoE values.")
+
+    return df
+
+
+def _woe_to_bins(woe):
+    # Select and rename columns
+    transformed_df = woe[
+        [
+            "variable",
+            "bin",
+            "count",
+            "count_distr",
+            "good",
+            "bad",
+            "badprob",
+            "woe",
+            "bin_iv",
+            "total_iv",
+        ]
+    ].copy()
+    transformed_df.rename(columns={"bin_iv": "total_iv"}, inplace=True)
+
+    # Create 'is_special_values' column (assuming there are no special values)
+    transformed_df["is_special_values"] = False
+
+    # Transform 'bin' column into interval format and store it in 'breaks' column
+    transformed_df["breaks"] = transformed_df["bin"].apply(
+        lambda x: "[-inf, %s)" % x if isinstance(x, float) else "[%s, inf)" % x
+    )
+
+    # Group by 'variable' to create bins dictionary
+    bins = {}
+    for variable, group in transformed_df.groupby("variable"):
+        bins[variable] = group
+
+    return bins
+
+
+def _woebin(df):
+    """
+    This function performs automatic binning using WoE.
+    df: A pandas dataframe
+    target_column: The target variable in quotes, e.g. 'loan_status'
+    """
+
+    non_numeric_cols = df.select_dtypes(exclude=["int64", "float64"]).columns
+    df[non_numeric_cols] = df[non_numeric_cols].astype(str)
+
+    try:
+        print(
+            f"Performing binning with breaks_adj: {breaks_adj}"
+        )  # print the breaks_adj being used
+        bins = sc.woebin(df, target_column, breaks_list=breaks_adj)
+    except Exception as e:
+        print("Error during binning: ")
+        print(e)
+    else:
+        bins_df = pd.concat(bins.values(), keys=bins.keys())
+        bins_df.reset_index(inplace=True)
+        bins_df.drop(columns=["variable"], inplace=True)
+        bins_df.rename(columns={"level_0": "variable"}, inplace=True)
+
+        bins_df["bin_number"] = bins_df.groupby("variable").cumcount()
+
+        return bins_df
+
+
+def split(df, add_constant=False):
+    df = df.copy()
+
+    # Splitting the dataset into training and test sets
+    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+
+    if add_constant:
+        # Add a constant to the model for both training and testing datasets
+        train_df = sm.add_constant(train_df)
+        test_df = sm.add_constant(test_df)
+
+    # Calculate and print details for the training dataset
+    print("After splitting the dataset into training and test sets:")
+    print(
+        f"Training Dataset:\nRows: {train_df.shape[0]}\nColumns: {train_df.shape[1]}\nMissing values: {train_df.isnull().sum().sum()}\n"
+    )
+
+    # Calculate and print details for the test dataset
+    print(
+        f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\nMissing values: {test_df.isnull().sum().sum()}\n"
+    )
+
+    return train_df, test_df
+
+
+def compute_scores(probabilities):
+
+    target_score = score_params["target_score"]
+    target_odds = score_params["target_odds"]
+    pdo = score_params["pdo"]
+
+    factor = pdo / np.log(2)
+    offset = target_score - (factor * np.log(target_odds))
+
+    scores = offset + factor * np.log(probabilities / (1 - probabilities))
+
+    return scores
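
Taken together, the new module provides an end-to-end application-scorecard workflow. A usage sketch, with the chaining order inferred from the function definitions above:

```python
import numpy as np
from validmind.datasets.credit_risk import lending_club

df = lending_club.load_data(source="offline")   # reads the bundled .csv.gz
df = lending_club.preprocess(df)
df = lending_club.feature_engineering(df)       # WoE-encodes the features
train_df, test_df = lending_club.split(df, add_constant=True)

# compute_scores applies the standard points-to-double-odds scaling:
#   factor = pdo / ln(2) = 20 / ln(2) ~= 28.85
#   offset = target_score - factor * ln(target_odds) = 600 - 28.85 * ln(50) ~= 487.12
# so a predicted probability of 0.5 (log-odds 0) maps to a score of ~487.12.
print(lending_club.compute_scores(np.array([0.25, 0.5, 0.75])))
```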
validmind/logging.py CHANGED
@@ -68,10 +68,17 @@ def get_logger(name="validmind", log_level=None):
     logger = logging.getLogger(name)
     logger.setLevel(log_level or _get_log_level())
 
-    # Check if the handler is already added
-    if not any(isinstance(h, type(handler)) for h in logger.handlers):
+    # Clear existing handlers if any (or refine the existing logic as necessary)
+    # TODO: lets add some better handler management
+    if not any(
+        isinstance(h, type(handler)) and h.formatter._fmt == formatter._fmt
+        for h in logger.handlers
+    ):
         logger.addHandler(handler)
 
+    # Prevent logger from propagating to root logger
+    logger.propagate = False
+
     return logger
 
 
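
Setting `logger.propagate = False` stops records from also reaching the root logger's handlers, which avoids duplicated log lines when the host application has configured the root logger. Illustration in plain stdlib logging:

```python
import logging

logging.basicConfig()                   # root logger gets a handler
logger = logging.getLogger("validmind")
logger.addHandler(logging.StreamHandler())

logger.warning("printed twice")         # own handler + propagation to root

logger.propagate = False
logger.warning("printed once")          # root no longer sees the record
```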
validmind/template.py CHANGED
@@ -4,9 +4,9 @@
 
 from pprint import pformat
 
+import mistune
 from IPython.display import display
 from ipywidgets import HTML, Accordion, VBox
-from markdown import markdown
 
 from .html_templates.content_blocks import (
     failed_content_block_html,
@@ -75,7 +75,7 @@ def _create_content_widget(content):
         HTML(
             test_content_block_html.format(
                 title=test_deets["Name"],
-                description=markdown(test_deets["Description"]),
+                description=mistune.html(test_deets["Description"]),
                 required_inputs=", ".join(
                     test_deets["Required Inputs"] or ["None"]
                 ),
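
Both `api_client.py` and `template.py` swap the `markdown` package for `mistune`, whose ready-made `mistune.html` callable renders Markdown text to HTML (metadata text is now converted before being sent to the API). For example:

```python
import mistune

html = mistune.html("**Accuracy** is above the *minimum* threshold.")
print(html)  # <p><strong>Accuracy</strong> is above the <em>minimum</em> threshold.</p>
```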
validmind/test_suites/__init__.py CHANGED
@@ -5,6 +5,8 @@
 """
 Entrypoint for test suites.
 """
+from inspect import getdoc
+
 import pandas as pd
 
 from ..logging import get_logger
@@ -139,7 +141,7 @@ def list_suites(pretty: bool = True):
             {
                 "ID": suite_id,
                 "Name": test_suite.__name__,
-                "Description": test_suite.__doc__.strip(),
+                "Description": getdoc(test_suite).strip(),
                 "Tests": ", ".join(_get_test_suite_test_ids(test_suite)),
             }
         )
@@ -167,7 +169,7 @@ def describe_suite(test_suite_id: str, verbose=False):
         {
             "ID": test_suite_id,
             "Name": test_suite.__name__,
-            "Description": test_suite.__doc__.strip(),
+            "Description": getdoc(test_suite).strip(),
             "Tests": ", ".join(_get_test_suite_test_ids(test_suite)),
         }
     ]
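
`inspect.getdoc` is a safer replacement for raw `__doc__` here: it dedents the docstring uniformly and, since Python 3.5, inherits docstrings from base classes when a subclass doesn't define its own. For example:

```python
from inspect import getdoc

class TailoredTestSuite:
    """Test suite for tabular data.

        Runs the default battery of tabular-data tests.
    """

print(TailoredTestSuite.__doc__)  # keeps the raw leading indentation
print(getdoc(TailoredTestSuite))  # cleanly dedented
```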