validmind 2.0.7__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +3 -3
- validmind/__version__.py +1 -1
- validmind/ai.py +7 -11
- validmind/api_client.py +29 -27
- validmind/client.py +10 -3
- validmind/datasets/credit_risk/__init__.py +11 -0
- validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club.py +394 -0
- validmind/logging.py +9 -2
- validmind/template.py +2 -2
- validmind/test_suites/__init__.py +4 -2
- validmind/tests/__init__.py +97 -50
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
- validmind/tests/data_validation/ScatterPlot.py +8 -2
- validmind/tests/decorator.py +138 -14
- validmind/tests/model_validation/BertScore.py +1 -1
- validmind/tests/model_validation/BertScoreAggregate.py +1 -1
- validmind/tests/model_validation/BleuScore.py +1 -1
- validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
- validmind/tests/model_validation/ContextualRecall.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +110 -0
- validmind/tests/model_validation/MeteorScore.py +1 -1
- validmind/tests/model_validation/RegardHistogram.py +1 -1
- validmind/tests/model_validation/RegardScore.py +1 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
- validmind/tests/model_validation/RougeMetrics.py +1 -1
- validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
- validmind/tests/model_validation/SelfCheckNLIScore.py +1 -1
- validmind/tests/model_validation/TokenDisparity.py +1 -1
- validmind/tests/model_validation/ToxicityHistogram.py +1 -1
- validmind/tests/model_validation/ToxicityScore.py +1 -1
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +1 -1
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +15 -18
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
- validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +33 -3
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +2 -2
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
- validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
- validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
- validmind/tests/test_providers.py +14 -124
- validmind/unit_metrics/__init__.py +76 -69
- validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
- validmind/unit_metrics/classification/sklearn/F1.py +13 -0
- validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
- validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
- validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
- validmind/unit_metrics/composite.py +24 -71
- validmind/unit_metrics/regression/GiniCoefficient.py +20 -26
- validmind/unit_metrics/regression/HuberLoss.py +12 -16
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +18 -24
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +7 -13
- validmind/unit_metrics/regression/MeanBiasDeviation.py +5 -14
- validmind/unit_metrics/regression/QuantileLoss.py +6 -16
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +12 -18
- validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +6 -15
- validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +5 -14
- validmind/unit_metrics/regression/sklearn/RSquaredScore.py +6 -15
- validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +11 -14
- validmind/utils.py +18 -45
- validmind/vm_models/__init__.py +0 -2
- validmind/vm_models/dataset.py +255 -16
- validmind/vm_models/test/metric.py +1 -2
- validmind/vm_models/test/result_wrapper.py +12 -13
- validmind/vm_models/test/test.py +2 -1
- validmind/vm_models/test/threshold_test.py +1 -2
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test_suite.py +2 -1
- {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/METADATA +10 -6
- {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/RECORD +97 -96
- validmind/tests/__types__.py +0 -62
- validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
- validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
- validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
- validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
- validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -22
- validmind/unit_metrics/sklearn/classification/F1.py +0 -24
- validmind/unit_metrics/sklearn/classification/Precision.py +0 -24
- validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -22
- validmind/unit_metrics/sklearn/classification/Recall.py +0 -22
- validmind/vm_models/test/unit_metric.py +0 -88
- {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/LICENSE +0 -0
- {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/WHEEL +0 -0
- {validmind-2.0.7.dist-info → validmind-2.1.1.dist-info}/entry_points.txt +0 -0
validmind/__init__.py
CHANGED
@@ -60,8 +60,7 @@ from .client import (  # noqa: E402
     run_documentation_tests,
     run_test_suite,
 )
-from .tests.decorator import metric
-from .unit_metrics import run_metric
+from .tests.decorator import metric, tags, tasks
 from .utils import run_async  # noqa: E402


@@ -111,6 +110,8 @@ __all__ = [  # noqa
    "reload",
    "run_documentation_tests",
    "run_test_suite",
+    "tags",
+    "tasks",
    "tests",
    "test_suites",
    "vm_models",
@@ -119,5 +120,4 @@ __all__ = [  # noqa
    "log_figure",
    "log_metrics",
    "log_test_results",
-    "run_metric",
 ]
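The package root now exports the `tags` and `tasks` decorators alongside `metric` (and drops the top-level `run_metric`). A minimal sketch of how a custom metric might be registered with them, assuming `metric` can decorate a function directly and `tags`/`tasks` accept arbitrary string labels; the metric body and dataset accessors are illustrative, not documented API:

```python
import validmind as vm


@vm.metric
@vm.tags("classification", "sklearn")  # free-form labels for filtering tests
@vm.tasks("classification")            # task types the metric applies to
def Accuracy(model, dataset):
    """Computes accuracy by comparing predicted and actual labels."""
    from sklearn.metrics import accuracy_score

    # `dataset.y` and `dataset.y_pred(model)` are assumed VMDataset accessors
    return accuracy_score(dataset.y, dataset.y_pred(model))
```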
validmind/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "2.0.7"
+__version__ = "2.1.1"
validmind/ai.py
CHANGED
@@ -7,8 +7,6 @@ import os

 from openai import AzureOpenAI, OpenAI

-from .utils import clean_docstring
-
 SYSTEM_PROMPT = """
 You are an expert data scientist and MRM specialist tasked with providing concise and
 objective insights based on the results of quantitative model or dataset analysis.
@@ -23,22 +21,20 @@ Your analysis will act as the description of the result in the model documentation

 Avoid long sentences and complex vocabulary.
 Structure the response clearly and logically.
-Use Markdown syntax to format the response.
+Use valid Markdown syntax to format the response (tables are supported).
 Use the Test ID that is provided to form the Test Name e.g. "ClassImbalance" -> "Class Imbalance".
-Use the following format for the response:
+Use the following format for the response (feel free to modify slightly if necessary):
 ```
 **<Test Name>** <continue to explain what it does in detail>...

 The results of this test <detailed explanation of the results>...

-In summary the following key insights can be gained
+In summary the following key insights can be gained:

 - **<key insight 1 - title>**: <explanation of key insight 1>
 - ...<continue with any other key insights using the same format>
 ```
 It is very important that the text is nicely formatted and contains enough information to be useful to the user as documentation.
-
-- use valid markdown syntax: make sure to have two newlines between paragraphs and before bullet points etc.
 """.strip()
 USER_PROMPT = """
 Test ID: {test_name}
@@ -71,7 +67,7 @@ def __get_client_and_model():

     if "OPENAI_API_KEY" in os.environ:
         __client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-        __model = os.environ.get("VM_OPENAI_MODEL", "gpt-4-turbo
+        __model = os.environ.get("VM_OPENAI_MODEL", "gpt-4-turbo")

     elif "AZURE_OPENAI_KEY" in os.environ:
         if "AZURE_OPENAI_ENDPOINT" not in os.environ:
@@ -111,7 +107,7 @@ class DescriptionFuture:

     def get_description(self):
         # This will block until the future is completed
-        return
+        return self._future.result()


 def generate_description_async(
@@ -132,7 +128,7 @@ def generate_description_async(
         raise ValueError("No results, summary or figures provided")

         response = client.chat.completions.create(
-            model="gpt-4-
+            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
@@ -160,7 +156,7 @@ def generate_description_async(
        )
    else:
        response = client.chat.completions.create(
-            model="gpt-4-turbo
+            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
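The `DescriptionFuture.get_description()` fix makes the accessor block on the wrapped future and hand back its result. A standalone sketch of that pattern with `concurrent.futures`; the plumbing around the class is assumed rather than copied from the library:

```python
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor()


class DescriptionFuture:
    """Wraps a future so a description can be generated in the background."""

    def __init__(self, future):
        self._future = future

    def get_description(self):
        # This will block until the future is completed
        return self._future.result()


# Usage: submit the slow LLM call, keep working, then block only on access
future = DescriptionFuture(executor.submit(lambda: "generated description"))
print(future.get_description())
```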
validmind/api_client.py
CHANGED
@@ -16,6 +16,7 @@ from io import BytesIO
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import aiohttp
+import mistune
 import requests
 from aiohttp import FormData

@@ -294,32 +295,33 @@ async def log_figures(figures: List[Figure]) -> Dict[str, Any]:
     Returns:
         dict: The response from the API
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # this actually slows things down - better to log them in parallel
+    # if client_config.can_log_figures():  # check if the backend supports batch logging
+    #     try:
+    #         data = {}
+    #         files = {}
+    #         for figure in figures:
+    #             data.update(
+    #                 {f"{k}-{figure.key}": v for k, v in figure.serialize().items()}
+    #             )
+    #             files.update(
+    #                 {
+    #                     f"{k}-{figure.key}": v
+    #                     for k, v in figure.serialize_files().items()
+    #                 }
+    #             )
+
+    #         return await _post(
+    #             "log_figures",
+    #             data=data,
+    #             files=files,
+    #         )
+    #     except Exception as e:
+    #         logger.error("Error logging figures to ValidMind API")
+    #         raise e
+
+    # else:
     return await asyncio.gather(*[log_figure(figure) for figure in figures])


 async def log_metadata(
@@ -342,7 +344,7 @@ async def log_metadata(
     """
     metadata_dict = {"content_id": content_id}
     if text is not None:
-        metadata_dict["text"] = text
+        metadata_dict["text"] = mistune.html(text)
     if _json is not None:
         metadata_dict["json"] = _json

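Both `log_metadata` here and the template widgets below now render Markdown to HTML with `mistune` rather than the `markdown` package. For reference, `mistune.html` is a one-call converter:

```python
import mistune

text = "**Class Imbalance**\n\nIn summary the following key insights can be gained:\n\n- **Skew**: ..."

# Produces an HTML fragment suitable for the `text` field of the metadata payload
print(mistune.html(text))
# -> "<p><strong>Class Imbalance</strong></p>..." plus a <ul> for the bullets (approximate output)
```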
validmind/client.py
CHANGED
@@ -61,8 +61,13 @@ def init_dataset(
     """
     Initializes a VM Dataset, which can then be passed to other functions
     that can perform additional analysis and tests on the data. This function
-    also ensures we are reading a valid dataset type.
-
+    also ensures we are reading a valid dataset type.
+
+    The following dataset types are supported:
+    - Pandas DataFrame
+    - Polars DataFrame
+    - Numpy ndarray
+    - Torch TensorDataset

     Args:
         dataset : dataset from various python libraries
@@ -380,7 +385,7 @@ def preview_template():


 def run_documentation_tests(
-    section=None, send=True, fail_fast=False, inputs=None, **kwargs
+    section=None, send=True, fail_fast=False, inputs=None, config=None, **kwargs
 ):
     """Collect and run all the tests associated with a template

@@ -393,6 +398,7 @@ def run_documentation_tests(
         send (bool, optional): Whether to send the results to the ValidMind API. Defaults to True.
         fail_fast (bool, optional): Whether to stop running tests after the first failure. Defaults to False.
         inputs (dict, optional): A dictionary of test inputs to pass to the TestSuite
+        config: A dictionary of test parameters to override the defaults
         **kwargs: backwards compatibility for passing in test inputs using keyword arguments

     Returns:
@@ -421,6 +427,7 @@ def run_documentation_tests(
             send=send,
             fail_fast=fail_fast,
             inputs=inputs,
+            config=config,
             **kwargs,
         )
         test_suites[_section] = test_suite
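With `config` now threaded through `run_documentation_tests` into the underlying test suite, per-test parameter overrides can be passed at the top level. A hedged sketch; the section name, input keys, test ID, and the exact config schema shown here are assumptions, not documented API:

```python
import pandas as pd
import validmind as vm

df = pd.read_csv("lending_club_loan_data.csv")  # placeholder data
vm_dataset = vm.init_dataset(dataset=df, target_column="loan_status")

vm.run_documentation_tests(
    section="data_preparation",       # assumed section ID from the template
    inputs={"dataset": vm_dataset},
    config={
        # assumed shape: test ID -> overrides for that test's default params
        "validmind.data_validation.ClassImbalance": {
            "params": {"min_percent_threshold": 10},
        },
    },
)
```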
validmind/datasets/credit_risk/__init__.py
ADDED
@@ -0,0 +1,11 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""
+Entrypoint for credit risk datasets.
+"""
+
+__all__ = [
+    "lending_club",
+]
validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz
ADDED
Binary file
validmind/datasets/credit_risk/lending_club.py
ADDED
@@ -0,0 +1,394 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import os
+
+import numpy as np
+import pandas as pd
+import scorecardpy as sc
+import statsmodels.api as sm
+from sklearn.model_selection import train_test_split
+
+current_path = os.path.dirname(os.path.abspath(__file__))
+dataset_path = os.path.join(current_path, "datasets")
+
+# URLs or file paths for online and offline data
+online_data_file = "https://vmai.s3.us-west-1.amazonaws.com/datasets/lending_club_loan_data_2007_2014.csv"
+offline_data_file = os.path.join(
+    dataset_path, "lending_club_loan_data_2007_2014_clean.csv.gz"
+)
+
+target_column = "loan_status"
+
+drop_columns = [
+    "Unnamed: 0",
+    "id",
+    "member_id",
+    "funded_amnt",
+    "emp_title",
+    "url",
+    "desc",
+    "application_type",
+    "title",
+    "zip_code",
+    "delinq_2yrs",
+    "mths_since_last_delinq",
+    "mths_since_last_record",
+    "mths_since_last_major_derog",
+    "revol_bal",
+    "total_rec_prncp",
+    "total_rec_late_fee",
+    "recoveries",
+    "out_prncp_inv",
+    "out_prncp",
+    "collection_recovery_fee",
+    "next_pymnt_d",
+    "initial_list_status",
+    "pub_rec",
+    "collections_12_mths_ex_med",
+    "policy_code",
+    "acc_now_delinq",
+    "pymnt_plan",
+    "tot_coll_amt",
+    "tot_cur_bal",
+    "total_rev_hi_lim",
+    "last_pymnt_d",
+    "last_credit_pull_d",
+    "earliest_cr_line",
+    "issue_d",
+    "addr_state",
+    "dti",
+    "revol_util",
+    "total_pymnt_inv",
+    "inq_last_6mths",
+    "total_rec_int",
+    "last_pymnt_amnt",
+]
+
+drop_features = [
+    "loan_amnt",
+    "funded_amnt_inv",
+    "total_pymnt",
+]
+
+categorical_variables = [
+    "term",
+    "grade",
+    "sub_grade",
+    "emp_length",
+    "home_ownership",
+    "verification_status",
+    "purpose",
+]
+
+breaks_adj = {
+    "loan_amnt": [5000, 10000, 15000, 20000, 25000],
+    "int_rate": [10, 15, 20],
+    "annual_inc": [50000, 100000, 150000],
+}
+
+score_params = {
+    "target_score": 600,
+    "target_odds": 50,
+    "pdo": 20,
+}
+
+
+def load_data(source="online"):
+    """
+    Load data from either an online source or offline files, automatically dropping specified columns for offline data.
+
+    :param source: 'online' for online data, 'offline' for offline files. Defaults to 'online'.
+    :return: DataFrame containing the loaded data.
+    """
+
+    if source == "online":
+        print(f"Loading data from an online source: {online_data_file}")
+        df = pd.read_csv(online_data_file)
+        df = _clean_data(df)
+
+    elif source == "offline":
+        print(f"Loading data from an offline .gz file: {offline_data_file}")
+        # Since we know the offline_data_file path ends with '.zip', we replace it with '.csv.gz'
+        gzip_file_path = offline_data_file.replace(".zip", ".csv.gz")
+        print(f"Attempting to read from .gz file: {gzip_file_path}")
+        # Read the CSV file directly from the .gz archive
+        df = pd.read_csv(gzip_file_path, compression="gzip")
+        print("Data loaded successfully.")
+    else:
+        raise ValueError("Invalid source specified. Choose 'online' or 'offline'.")
+
+    print(
+        f"Rows: {df.shape[0]}, Columns: {df.shape[1]}, Missing values: {df.isnull().sum().sum()}"
+    )
+    return df
+
+
+def _clean_data(df):
+    df = df.copy()
+
+    # Drop columns not relevant for application scorecards
+    df = df.drop(columns=drop_columns)
+
+    # Drop rows with missing target values
+    df.dropna(subset=[target_column], inplace=True)
+    print("Dropping rows with missing target values:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Drop columns with more than N percent missing values
+    missing_values = df.isnull().mean()
+    df = df.loc[:, missing_values < 0.7]
+    print("Dropping columns with more than 70% missing values:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Drop columns with only one unique value
+    unique_values = df.nunique()
+    df = df.loc[:, unique_values > 1]
+    print("Dropping columns with only one unique value:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Define the target variable for the model, representing loan default status.
+    df[target_column] = df[target_column].map({"Fully Paid": 0, "Charged Off": 1})
+
+    # Drop rows with NaN in target_column after mapping
+    df.dropna(subset=[target_column], inplace=True)
+    print("Dropping rows with missing target values:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    return df
+
+
+def preprocess(df):
+    df = df.copy()
+
+    # Convert the target variable to integer type for modeling.
+    df[target_column] = df[target_column].astype(int)
+
+    # Keep rows where purpose is 'debt_consolidation' or 'credit_card'
+    df = df[df["purpose"].isin(["debt_consolidation", "credit_card"])]
+    print("Filtering 'purpose' to 'debt_consolidation' and 'credit_card':")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Remove rows where grade is 'F' or 'G'
+    df = df[~df["grade"].isin(["F", "G"])]
+    print("Filtering out 'grade' F and G:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Remove rows where sub_grade starts with 'F' or 'G'
+    df = df[~df["sub_grade"].str.startswith(("F", "G"))]
+    print("Filtering out 'sub_grade' F and G:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Remove rows where home_ownership is 'OTHER', 'NONE', or 'ANY'
+    df = df[~df["home_ownership"].isin(["OTHER", "NONE", "ANY"])]
+    print("Filtering out 'home_ownership' OTHER, NONE, ANY:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Drop features that are not useful for modeling
+    df.drop(drop_features, axis=1, inplace=True)
+    print("Dropping specified features:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Drop rows with missing values
+    df.dropna(inplace=True)
+    print("Dropping rows with any missing values:")
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    # Preprocess emp_length column
+    df = _preprocess_emp_length(df)
+
+    # Preprocess term column
+    df = _preprocess_term(df)
+
+    return df
+
+
+def _preprocess_term(df):
+    df = df.copy()
+
+    # Remove ' months' and convert to integer
+    df["term"] = df["term"].str.replace(" months", "").astype(object)
+
+    return df
+
+
+def _preprocess_emp_length(df):
+    df = df.copy()
+
+    # Mapping string values to numbers
+    emp_length_map = {
+        "10+ years": 10,
+        "< 1 year": 0,
+        "1 year": 1,
+        "2 years": 2,
+        "3 years": 3,
+        "4 years": 4,
+        "5 years": 5,
+        "6 years": 6,
+        "7 years": 7,
+        "8 years": 8,
+        "9 years": 9,
+    }
+
+    # Apply the mapping to the emp_length column
+    df["emp_length"] = df["emp_length"].map(emp_length_map).astype(object)
+
+    # Drop rows where emp_length is NaN after mapping
+    # df.dropna(subset=["emp_length"], inplace=True)
+
+    return df
+
+
+def feature_engineering(df):
+    df = df.copy()
+
+    # WoE encoding of numerical and categorical features
+    df = woe_encoding(df)
+
+    print(
+        f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+    )
+
+    return df
+
+
+def woe_encoding(df):
+    df = df.copy()
+
+    woe = _woebin(df)
+    bins = _woe_to_bins(woe)
+
+    # Make sure we don't transform the target column
+    if target_column in bins:
+        del bins[target_column]
+        print(f"Excluded {target_column} from WoE transformation.")
+
+    # Apply the WoE transformation
+    df = sc.woebin_ply(df, bins=bins)
+
+    print("Successfully converted features to WoE values.")
+
+    return df
+
+
+def _woe_to_bins(woe):
+    # Select and rename columns
+    transformed_df = woe[
+        [
+            "variable",
+            "bin",
+            "count",
+            "count_distr",
+            "good",
+            "bad",
+            "badprob",
+            "woe",
+            "bin_iv",
+            "total_iv",
+        ]
+    ].copy()
+    transformed_df.rename(columns={"bin_iv": "total_iv"}, inplace=True)
+
+    # Create 'is_special_values' column (assuming there are no special values)
+    transformed_df["is_special_values"] = False
+
+    # Transform 'bin' column into interval format and store it in 'breaks' column
+    transformed_df["breaks"] = transformed_df["bin"].apply(
+        lambda x: "[-inf, %s)" % x if isinstance(x, float) else "[%s, inf)" % x
+    )
+
+    # Group by 'variable' to create bins dictionary
+    bins = {}
+    for variable, group in transformed_df.groupby("variable"):
+        bins[variable] = group
+
+    return bins
+
+
+def _woebin(df):
+    """
+    This function performs automatic binning using WoE.
+    df: A pandas dataframe
+    target_column: The target variable in quotes, e.g. 'loan_status'
+    """
+
+    non_numeric_cols = df.select_dtypes(exclude=["int64", "float64"]).columns
+    df[non_numeric_cols] = df[non_numeric_cols].astype(str)
+
+    try:
+        print(
+            f"Performing binning with breaks_adj: {breaks_adj}"
+        )  # print the breaks_adj being used
+        bins = sc.woebin(df, target_column, breaks_list=breaks_adj)
+    except Exception as e:
+        print("Error during binning: ")
+        print(e)
+    else:
+        bins_df = pd.concat(bins.values(), keys=bins.keys())
+        bins_df.reset_index(inplace=True)
+        bins_df.drop(columns=["variable"], inplace=True)
+        bins_df.rename(columns={"level_0": "variable"}, inplace=True)
+
+        bins_df["bin_number"] = bins_df.groupby("variable").cumcount()
+
+        return bins_df
+
+
+def split(df, add_constant=False):
+    df = df.copy()
+
+    # Splitting the dataset into training and test sets
+    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+
+    if add_constant:
+        # Add a constant to the model for both training and testing datasets
+        train_df = sm.add_constant(train_df)
+        test_df = sm.add_constant(test_df)
+
+    # Calculate and print details for the training dataset
+    print("After splitting the dataset into training and test sets:")
+    print(
+        f"Training Dataset:\nRows: {train_df.shape[0]}\nColumns: {train_df.shape[1]}\nMissing values: {train_df.isnull().sum().sum()}\n"
+    )
+
+    # Calculate and print details for the test dataset
+    print(
+        f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\nMissing values: {test_df.isnull().sum().sum()}\n"
+    )
+
+    return train_df, test_df
+
+
+def compute_scores(probabilities):
+
+    target_score = score_params["target_score"]
+    target_odds = score_params["target_odds"]
+    pdo = score_params["pdo"]
+
+    factor = pdo / np.log(2)
+    offset = target_score - (factor * np.log(target_odds))
+
+    scores = offset + factor * np.log(probabilities / (1 - probabilities))
+
+    return scores
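End to end, the new module is evidently meant to be driven roughly as below. This is a sketch built from the functions above; the logistic regression fit is illustrative, not part of the module:

```python
import statsmodels.api as sm

from validmind.datasets.credit_risk import lending_club

df = lending_club.load_data(source="offline")
df = lending_club.preprocess(df)
df = lending_club.feature_engineering(df)  # WoE-encodes the features
train_df, test_df = lending_club.split(df, add_constant=True)

# Illustrative model: logistic regression on the WoE-encoded features
x_train = train_df.drop(columns=[lending_club.target_column])
y_train = train_df[lending_club.target_column]
model = sm.GLM(y_train, x_train, family=sm.families.Binomial()).fit()

# compute_scores maps predicted probabilities to scorecard points:
#   factor = pdo / ln(2) = 20 / ln(2) ≈ 28.85
#   offset = target_score - factor * ln(target_odds) = 600 - 28.85 * ln(50) ≈ 487.1
#   score  = offset + factor * ln(p / (1 - p))
scores = lending_club.compute_scores(model.predict(x_train))
```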
validmind/logging.py
CHANGED
@@ -68,10 +68,17 @@ def get_logger(name="validmind", log_level=None):
     logger = logging.getLogger(name)
     logger.setLevel(log_level or _get_log_level())

-    #
-
+    # Clear existing handlers if any (or refine the existing logic as necessary)
+    # TODO: lets add some better handler management
+    if not any(
+        isinstance(h, type(handler)) and h.formatter._fmt == formatter._fmt
+        for h in logger.handlers
+    ):
         logger.addHandler(handler)

+    # Prevent logger from propagating to root logger
+    logger.propagate = False
+
     return logger


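Setting `logger.propagate = False` is what stops each record from also reaching the root logger's handlers, which is the usual cause of duplicated log lines. A standalone standard-library illustration:

```python
import logging

logging.basicConfig(level=logging.INFO)  # root logger gets its own handler

logger = logging.getLogger("validmind")
logger.addHandler(logging.StreamHandler())

logger.warning("hello")        # printed twice: own handler + root handler

logger.propagate = False
logger.warning("hello again")  # printed once: own handler only
```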
validmind/template.py
CHANGED
@@ -4,9 +4,9 @@

 from pprint import pformat

+import mistune
 from IPython.display import display
 from ipywidgets import HTML, Accordion, VBox
-from markdown import markdown

 from .html_templates.content_blocks import (
     failed_content_block_html,
@@ -75,7 +75,7 @@ def _create_content_widget(content):
             HTML(
                 test_content_block_html.format(
                     title=test_deets["Name"],
-                    description=
+                    description=mistune.html(test_deets["Description"]),
                     required_inputs=", ".join(
                         test_deets["Required Inputs"] or ["None"]
                     ),
validmind/test_suites/__init__.py
CHANGED
@@ -5,6 +5,8 @@
 """
 Entrypoint for test suites.
 """
+from inspect import getdoc
+
 import pandas as pd

 from ..logging import get_logger
@@ -139,7 +141,7 @@ def list_suites(pretty: bool = True):
             {
                 "ID": suite_id,
                 "Name": test_suite.__name__,
-                "Description": test_suite.
+                "Description": getdoc(test_suite).strip(),
                 "Tests": ", ".join(_get_test_suite_test_ids(test_suite)),
             }
         )
@@ -167,7 +169,7 @@ def describe_suite(test_suite_id: str, verbose=False):
             {
                 "ID": test_suite_id,
                 "Name": test_suite.__name__,
-                "Description": test_suite.
+                "Description": getdoc(test_suite).strip(),
                 "Tests": ", ".join(_get_test_suite_test_ids(test_suite)),
             }
         ]
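`inspect.getdoc` differs from reading `__doc__` directly in two ways that matter for these suite tables: it dedents the docstring uniformly and it falls back to the class hierarchy when a subclass has no docstring of its own. A quick illustration; the class here is a stand-in for a test suite class:

```python
from inspect import getdoc


class TabularDataQuality:
    """
    Test suite for tabular data quality.
    """


print(repr(TabularDataQuality.__doc__))
# '\n    Test suite for tabular data quality.\n    '
print(repr(getdoc(TabularDataQuality)))
# 'Test suite for tabular data quality.'
```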