validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/TooManyZeroValues.py

@@ -2,23 +2,13 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset
 
-from ydata_profiling.config import Settings
-from ydata_profiling.model.typeset import ProfilingTypeSet
 
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
-
-
-@dataclass
-class TooManyZeroValues(ThresholdTest):
+@tags("tabular_data")
+@tasks("regression", "classification")
+def TooManyZeroValues(dataset: VMDataset, max_percent_threshold: float = 0.03):
     """
     Identifies numerical columns in a dataset that contain an excessive number of zero values, defined by a threshold
     percentage.
@@ -70,65 +60,26 @@ class TooManyZeroValues(ThresholdTest):
     - Cannot evaluate non-numerical or categorical columns, which might bring with them different types of concerns or
     issues.
     """
+    df = dataset.df
+
+    table = []
+
+    for col in dataset.feature_columns_numeric:
+        value_counts = df[col].value_counts()
 
-    default_params = {"max_percent_threshold": 0.03}
+        if 0 not in value_counts.index:
+            continue
 
+        n_zeros = value_counts[0]
+        p_zeros = n_zeros / df.shape[0]
 
-        """
-        The zeros test returns results like these:
-        [{"values": {"n_zeros": 10000, "p_zeros": 1.0}, "column": "Exited", "passed": true}]
-        """
-        results_table = [
+        table.append(
             {
-                "Column": result.column,
-                "Number of Zero Values": result.values["n_zeros"],
-                "Percentage of Zero Values (%)": result.values["p_zeros"] * 100,
-                "Pass/Fail": "Pass" if result.passed else "Fail",
+                "Column": col,
+                "Number of Zero Values": n_zeros,
+                "Percentage of Zero Values (%)": p_zeros * 100,
+                "Pass/Fail": "Pass" if p_zeros < max_percent_threshold else "Fail",
             }
-            for result in results
-        ]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(title="Zeros Results for Dataset"),
-                )
-            ]
         )
 
-    def run(self):
-        rows = self.inputs.dataset.df.shape[0]
-        typeset = ProfilingTypeSet(Settings())
-        dataset_types = typeset.infer_type(self.inputs.dataset.df)
-        results = []
-
-        for col in self.inputs.dataset.df.columns:
-            # Only calculate zeros for numerical columns
-            if str(dataset_types[col]) != "Numeric":
-                continue
-
-            value_counts = self.inputs.dataset.df[col].value_counts()
-
-            if 0 not in value_counts.index:
-                continue
-
-            n_zeros = value_counts[0]
-            p_zeros = n_zeros / rows
-
-            results.append(
-                ThresholdTestResult(
-                    column=col,
-                    passed=p_zeros < self.params["max_percent_threshold"],
-                    values={
-                        "n_zeros": n_zeros,
-                        "p_zeros": p_zeros,
-                    },
-                )
-            )
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
+    return table, all(row["Pass/Fail"] == "Pass" for row in table)
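These rewrites follow one pattern across the release: a `ThresholdTest` subclass with `run` and `summary` methods becomes a plain decorated function whose arguments are the test inputs and parameters, and whose return value is the results table plus an overall pass/fail flag. A minimal sketch of invoking the rewritten test through the functional runner, assuming an initialized ValidMind environment (the DataFrame contents and threshold are illustrative):

```python
import pandas as pd
import validmind as vm

# Illustrative data: "balance" is 50% zeros, so it should fail a 3% threshold.
df = pd.DataFrame({"balance": [0, 0, 150.0, 200.0], "age": [25, 32, 41, 58]})

# Wrap the raw DataFrame so the test receives a VMDataset input.
vm_dataset = vm.init_dataset(dataset=df)

# Keyword arguments of the test function are supplied via params.
result = vm.tests.run_test(
    "validmind.data_validation.TooManyZeroValues",
    inputs={"dataset": vm_dataset},
    params={"max_percent_threshold": 0.03},
)
```

The `(table, passed)` tuple returned by the function maps directly onto the result table and pass/fail status that the old `cache_results` call used to assemble.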
validmind/tests/data_validation/UniqueRows.py

@@ -2,20 +2,13 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
-
-@dataclass
-class UniqueRows(ThresholdTest):
+@tags("tabular_data")
+@tasks("regression", "classification")
+def UniqueRows(dataset: VMDataset, min_percent_threshold: float = 1):
     """
     Verifies the diversity of the dataset by ensuring that the count of unique rows exceeds a prescribed threshold.
 
@@ -57,53 +50,21 @@ class UniqueRows(ThresholdTest):
     - This test may not be suitable or useful for categorical variables, where the count of unique categories is
     inherently limited.
     """
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-            for result in results
-        ]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(
-                        title="Unique Rows Results for Dataset"
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        rows = self.inputs.dataset.df.shape[0]
-
-        unique_rows = self.inputs.dataset.df.nunique()
-        results = [
-            ThresholdTestResult(
-                column=col,
-                passed=(unique_rows[col] / rows) < self.params["min_percent_threshold"],
-                values={
-                    "n_unique": unique_rows[col],
-                    "p_unique": unique_rows[col] / rows,
-                },
-            )
-            for col in unique_rows.index
-        ]
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
+    df = dataset.df
+
+    rows = df.shape[0]
+    unique_rows = df.nunique()
+
+    table = [
+        {
+            "Column": col,
+            "Number of Unique Values": unique_rows[col],
+            "Percentage of Unique Values (%)": unique_rows[col] / rows * 100,
+            "Pass/Fail": (
+                "Pass" if unique_rows[col] / rows >= min_percent_threshold else "Fail"
+            ),
+        }
+        for col in unique_rows.index
+    ]
+
+    return table, all(row["Pass/Fail"] == "Pass" for row in table)
validmind/tests/data_validation/WOEBinPlots.py

@@ -2,8 +2,6 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
 import pandas as pd
 import plotly.express as px
@@ -11,11 +9,22 @@ import plotly.graph_objects as go
 import scorecardpy as sc
 from plotly.subplots import make_subplots
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
+from validmind.logging import get_logger
+from validmind.vm_models import VMDataset
+
+logger = get_logger(__name__)
 
 
-@dataclass
-class WOEBinPlots(Metric):
+@tags("tabular_data", "visualization", "categorical_data")
+@tasks("classification")
+def WOEBinPlots(
+    dataset: VMDataset,
+    breaks_adj: list = None,
+    fig_height: int = 600,
+    fig_width: int = 500,
+):
     """
     Generates visualizations of Weight of Evidence (WoE) and Information Value (IV) for understanding predictive power
     of categorical variables in a data set.
@@ -63,107 +72,72 @@ class WOEBinPlots(Metric):
     - The method requires a sufficient number of events per bin to generate a reliable information value and weight of
     evidence.
     """
-            fig.add_trace(
-                go.Scatter(
-                    x=variable_df["bin"],
-                    y=variable_df["woe"],
-                    mode="lines+markers",
-                    marker=dict(symbol="circle", size=6),
-                    hovertemplate="<b>%{x}</b><br>" + "WoE: %{y}<extra></extra>",
-                ),
-                row=1,
-                col=2,
-            )
-            fig.update_xaxes(
-                ticktext=variable_df["bin"].tolist(),
-                tickvals=np.arange(len(variable_df["bin"])),
-                row=1,
-                col=2,
-            )
-
-            fig.update_layout(
-                title=f"IV and WoE for {variable}",
-                height=fig_height,
-                width=fig_width,
-                showlegend=False,
-            )
-
-            figures.append(
-                Figure(
-                    for_object=self,
-                    key=f"{self.key}:{variable}",
-                    figure=fig,
-                )
-            )
-
-        return self.cache_results(figures=figures)
+    df = dataset.df
+
+    non_numeric_cols = df.select_dtypes(exclude=["int64", "float64"]).columns
+    df[non_numeric_cols] = df[non_numeric_cols].astype(str)
+
+    try:
+        bins = sc.woebin(df, dataset.target_column, breaks_list=breaks_adj)
+    except Exception as e:
+        raise SkipTestError(f"Error performing binning: {e}") from e
+
+    woe_iv_df = (
+        pd.concat(bins.values(), keys=bins.keys())
+        .reset_index()
+        .drop(columns=["variable"])
+        .rename(columns={"level_0": "variable", "level_1": "bin_number"})
+    )
+
+    figures = []
+
+    for variable in woe_iv_df["variable"].unique():
+        variable_df = woe_iv_df[woe_iv_df["variable"] == variable]
+
+        fig = make_subplots(rows=1, cols=2)
+
+        fig.add_trace(
+            go.Bar(
+                x=variable_df["bin"],
+                y=variable_df["bin_iv"],
+                marker_color=px.colors.qualitative.Plotly[: len(variable_df["bin"])],
+                hovertemplate="<b>%{x}</b><br>" + "IV: %{y}<extra></extra>",
+            ),
+            row=1,
+            col=1,
+        )
+        fig.update_xaxes(
+            ticktext=variable_df["bin"].tolist(),
+            tickvals=np.arange(len(variable_df["bin"])),
+            row=1,
+            col=1,
+        )
+
+        fig.add_trace(
+            go.Scatter(
+                x=variable_df["bin"],
+                y=variable_df["woe"],
+                mode="lines+markers",
+                marker=dict(symbol="circle", size=6),
+                hovertemplate="<b>%{x}</b><br>" + "WoE: %{y}<extra></extra>",
+            ),
+            row=1,
+            col=2,
+        )
+        fig.update_xaxes(
+            ticktext=variable_df["bin"].tolist(),
+            tickvals=np.arange(len(variable_df["bin"])),
+            row=1,
+            col=2,
+        )
+
+        fig.update_layout(
+            title=f"IV and WoE for {variable}",
+            height=fig_height,
+            width=fig_width,
+            showlegend=False,
+        )
+
+        figures.append(fig)
+
+    return tuple(figures)
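Because the rewritten test returns bare plotly figures rather than `Figure` wrappers keyed to a `Metric` instance, figure sizing and binning breaks now travel through `params` like any other keyword argument. A sketch under the assumption that `vm_dataset` was created with `target_column` pointing at a binary target (the sizes shown simply echo the defaults):

```python
import validmind as vm

result = vm.tests.run_test(
    "validmind.data_validation.WOEBinPlots",
    inputs={"dataset": vm_dataset},  # needs target_column set at init_dataset time
    params={
        "breaks_adj": None,  # None keeps scorecardpy's automatic binning
        "fig_height": 600,
        "fig_width": 500,
    },
)
```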
validmind/tests/data_validation/WOEBinTable.py

@@ -2,16 +2,17 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import pandas as pd
 import scorecardpy as sc
 
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
+from validmind.vm_models import VMDataset
 
 
-@dataclass
-class WOEBinTable(Metric):
+@tags("tabular_data", "categorical_data")
+@tasks("classification")
+def WOEBinTable(dataset: VMDataset, breaks_adj: list = None):
     """
     Assesses the Weight of Evidence (WoE) and Information Value (IV) of each feature to evaluate its predictive power
     in a binary classification model.
@@ -26,9 +27,10 @@ class WOEBinTable(Metric):
     ### Test Mechanism
 
     The test uses the `scorecardpy.woebin` method to perform automatic binning of the dataset based on WoE. The method
-    used to calculate the WoE and IV values, effectively creating
-    and IV values for each feature. A target variable is required
+    accepts a list of break points for binning numeric variables through the parameter `breaks_adj`. If no breaks are
+    provided, it uses default binning. The bins are then used to calculate the WoE and IV values, effectively creating
+    a dataframe that includes the bin boundaries, WoE, and IV values for each feature. A target variable is required
+    in the dataset to perform this analysis.
 
     ### Signs of High Risk
 
@@ -49,65 +51,22 @@ class WOEBinTable(Metric):
    - The metric does not help in distinguishing whether the observed predictive factor is due to data randomness or a
    true phenomenon.
    """
-        return self.cache_results(
-            {
-                "woe_iv": bins_df.to_dict(orient="records"),
-            }
-        )
-
-    def binning_data(self, df, y, breaks_adj=None):
-        """
-        This function performs automatic binning using WoE.
-        df: A pandas dataframe
-        y: The target variable in quotes, e.g. 'target'
-        """
-        non_numeric_cols = df.select_dtypes(exclude=["int64", "float64"]).columns
-        df[non_numeric_cols] = df[non_numeric_cols].astype(str)
-
-        try:
-            print(
-                f"Performing binning with breaks_adj: {breaks_adj}"
-            )  # print the breaks_adj being used
-            bins = sc.woebin(df, y, breaks_list=breaks_adj)
-        except Exception as e:
-            print("Error during binning: ")
-            print(e)
-        else:
-            bins_df = pd.concat(bins.values(), keys=bins.keys())
-            bins_df.reset_index(inplace=True)
-            bins_df.drop(columns=["variable"], inplace=True)
-            bins_df.rename(columns={"level_0": "variable"}, inplace=True)
-
-            bins_df["bin_number"] = bins_df.groupby("variable").cumcount()
-
-            return bins_df
-
-    def summary(self, metric_value):
-        summary_woe_iv_table = metric_value["woe_iv"]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=summary_woe_iv_table,
-                    metadata=ResultTableMetadata(
-                        title="Weight of Evidence (WoE) and Information Value (IV)"
-                    ),
-                )
-            ]
-        )
+    df = dataset.df
+
+    non_numeric_cols = df.select_dtypes(exclude=["int64", "float64"]).columns
+    df[non_numeric_cols] = df[non_numeric_cols].astype(str)
+
+    try:
+        bins = sc.woebin(df, dataset.target_column, breaks_list=breaks_adj)
+    except Exception as e:
+        raise SkipTestError(f"Error during binning: {e}")
+
+    return {
+        "Weight of Evidence (WoE) and Information Value (IV)": (
+            pd.concat(bins.values(), keys=bins.keys())
+            .reset_index()
+            .drop(columns=["variable"])
+            .rename(columns={"level_0": "variable"})
+            .assign(bin_number=lambda x: x.groupby("variable").cumcount())
+        )
+    }
validmind/tests/data_validation/ZivotAndrewsArch.py

@@ -2,20 +2,21 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import pandas as pd
 from arch.unitroot import ZivotAndrews
 from numpy.linalg import LinAlgError
 
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
 from validmind.logging import get_logger
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind.vm_models import VMDataset
 
 logger = get_logger(__name__)
 
 
-@dataclass
-class ZivotAndrewsArch(Metric):
+@tags("time_series_data", "stationarity", "unit_root_test")
+@tasks("regression")
+def ZivotAndrewsArch(dataset: VMDataset):
     """
     Evaluates the order of integration and stationarity of time series data using the Zivot-Andrews unit root test.
 
@@ -54,75 +55,32 @@ class ZivotAndrewsArch(Metric):
    - May not account for unexpected shocks or changes in the series trend, both of which can significantly impact data
    stationarity.
    """
+    df = dataset.df
+    if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+        raise SkipTestError(
+            "Dataset index must be a datetime or period index for time series analysis."
+        )
 
-                "Dataset contains missing values. Rows with NaNs will be dropped."
-            )
-            dataset = dataset.dropna()
-
-        # Convert to numeric and handle non-numeric data
-        dataset = dataset.apply(pd.to_numeric, errors="coerce")
-
-        # Initialize a list to store Zivot-Andrews results
-        za_values = []
-
-        for col in dataset.columns:
-            try:
-                za = ZivotAndrews(dataset[col].values)
-                za_values.append(
-                    {
-                        "Variable": col,
-                        "stat": za.stat,
-                        "pvalue": za.pvalue,
-                        "usedlag": za.lags,
-                        "nobs": za.nobs,
-                    }
-                )
-            except (LinAlgError, ValueError) as e:
-                logger.error(f"Error while processing column '{col}'. Details: {e}")
-                za_values.append(
-                    {
-                        "Variable": col,
-                        "stat": None,
-                        "pvalue": None,
-                        "usedlag": None,
-                        "nobs": None,
-                        "error": str(e),
-                    }
-                )
-
-        return self.cache_results({"zivot_andrews_results": za_values})
-
-    def summary(self, metric_value):
-        """
-        Build a table for summarizing the Zivot-Andrews results
-        """
-        za_results = metric_value["zivot_andrews_results"]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=za_results,
-                    metadata=ResultTableMetadata(title="Zivot-Andrews Test Results"),
-                )
-            ]
-        )
+    df = df.dropna()
+    df = df.apply(pd.to_numeric, errors="coerce")
+
+    za_values = []
+
+    for col in df.columns:
+        try:
+            za = ZivotAndrews(df[col].values)
+        except (LinAlgError, ValueError) as e:
+            logger.error(f"Error while processing column '{col}': {e}")
+            continue
+
+        za_values.append(
+            {
+                "Variable": col,
+                "stat": za.stat,
+                "pvalue": za.pvalue,
+                "usedlag": za.lags,
+                "nobs": za.nobs,
+            }
+        )
+
+    return {"Zivot-Andrews Test Results": za_values}
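The `@tags` and `@tasks` decorators that replace the old `@dataclass` classes only attach discovery metadata to the function; the same convention extends to user-defined tests registered with the `vm.test` decorator. A minimal sketch of a custom test in the new style (the test ID and check are hypothetical, not part of the package):

```python
import validmind as vm
from validmind import tags, tasks
from validmind.vm_models import VMDataset


@vm.test("my_custom_tests.ZeroVariance")  # hypothetical custom test ID
@tags("tabular_data")
@tasks("classification")
def ZeroVariance(dataset: VMDataset):
    """Flags numeric feature columns whose values never vary."""
    table = [
        {"Column": col, "Variance": float(dataset.df[col].var())}
        for col in dataset.feature_columns_numeric
    ]
    # Same convention as the rewritten built-in tests: table plus pass/fail flag.
    return table, all(row["Variance"] > 0 for row in table)
```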