validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +80 -119
- validmind/ai/test_result_description/config.yaml +29 -0
- validmind/ai/test_result_description/context.py +73 -0
- validmind/ai/test_result_description/image_processing.py +124 -0
- validmind/ai/test_result_description/system.jinja +39 -0
- validmind/ai/test_result_description/user.jinja +25 -0
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/credit_risk/__init__.py +1 -0
- validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club_bias.py +142 -0
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +14 -15
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/JarqueBera.py +70 -0
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LJungBox.py +66 -0
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
- validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/RunsTest.py +72 -0
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +42 -40
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +39 -36
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +38 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/utils.py +4 -0
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/metric.py +1 -0
- validmind/vm_models/test/result_wrapper.py +143 -158
- validmind/vm_models/test/threshold_test.py +1 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
- validmind-2.5.18.dist-info/RECORD +324 -0
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
- validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
- validmind-2.5.8.dist-info/RECORD +0 -318
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py

@@ -14,49 +14,52 @@ from validmind.vm_models import Figure, Metric
 @dataclass
 class RegressionModelForecastPlotLevels(Metric):
     """
-    [... 43 lines of the previous docstring, not captured in this view ...]
+    Assesses the alignment between forecasted and observed values in regression models through visual plots, including
+    handling data transformations.
+
+    ### Purpose
+
+    The `RegressionModelForecastPlotLevels` test aims to visually assess the performance of a series of regression
+    models by comparing their forecasted values against the actual observed values in both training and test datasets.
+    This test helps determine the accuracy of the models and can handle specific data transformations before making the
+    comparison, providing a comprehensive evaluation of model performance.
+
+    ### Test Mechanism
+
+    The test mechanism involves initializing the `RegressionModelForecastPlotLevels` class with an optional
+    `transformation` parameter. The class then:
+
+    - Checks for the presence of model objects and raises a `ValueError` if none are found.
+    - Processes each model to generate predictive forecasts for both training and testing datasets.
+    - Contrasts these forecasts with the actual observed values.
+    - Produces plots to visually compare forecasted and observed values for both raw and transformed datasets.
+    - Handles specified transformations (e.g., "integrate") by performing cumulative sums to create a new series before
+    plotting.
+
+    ### Signs of High Risk
+
+    - Significant deviation between forecasted and observed values in training or testing datasets.
+    - Patterns suggesting overfitting or underfitting.
+    - Large discrepancies in the plotted forecasts, indicating potential issues with model generalizability and
+    precision.
+
+    ### Strengths
+
+    - **Visual Evaluations**: Provides an intuitive, visual way to assess multiple regression models, aiding in easier
+    interpretation and evaluation of forecast accuracy.
+    - **Transformation Handling**: Can process specified data transformations such as "integrate," enhancing
+    flexibility.
+    - **Detailed Perspective**: Assesses performance on both training and testing datasets, offering a comprehensive
+    view of model behavior.
+
+    ### Limitations
+
+    - **Subjectivity**: Relies heavily on visual interpretation, which may vary between individuals.
+    - **Limited Transformation Capability**: Supports only the "integrate" transformation; other complex
+    transformations might not be handled.
+    - **Overhead**: Plotting can be computationally intensive for large datasets, increasing runtime.
+    - **Numerical Measurement**: Does not provide a numerical metric to quantify forecast accuracy, relying solely on
+    visual assessment.
     """

     name = "regression_forecast_plot_levels"
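The "integrate" transformation named in the new docstring is a cumulative sum that rebuilds a level series from differenced values. A minimal sketch of the idea (the function name and starting value are illustrative, not the package's internals):

```python
import numpy as np

def integrate(differenced, start_value=0.0):
    """Rebuild a level series from first differences via cumulative sum."""
    return start_value + np.cumsum(differenced)

diffs = np.array([0.5, -0.2, 0.1, 0.4])     # e.g., differenced forecasts
print(integrate(diffs, start_value=100.0))  # [100.5 100.3 100.4 100.8]
```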
validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py

@@ -16,44 +16,46 @@ logger = get_logger(__name__)
 @dataclass
 class RegressionModelSensitivityPlot(Metric):
     """
-    [... 22 lines of the previous docstring; only the fragment "visualizing the" captured ...]
+    Assesses the sensitivity of a regression model to changes in independent variables by applying shocks and
+    visualizing the impact.
+
+    ### Purpose
+
+    The Regression Sensitivity Plot test is designed to perform sensitivity analysis on regression models. This test
+    aims to measure the impact of slight changes (shocks) applied to individual variables on the system's outcome while
+    keeping all other variables constant. By doing so, it analyzes the effects of each independent variable on the
+    dependent variable within the regression model, helping identify significant risk factors that could substantially
+    influence the model's output.
+
+    ### Test Mechanism
+
+    This test operates by initially applying shocks of varying magnitudes, defined by specific parameters, to each of
+    the model's features, one at a time. With all other variables held constant, a new prediction is made for each
+    dataset subjected to shocks. Any changes in the model's predictions are directly attributed to the shocks applied.
+    If the transformation parameter is set to "integrate," initial predictions and target values undergo transformation
+    via an integration function before being plotted. Finally, a plot demonstrating observed values against predicted
+    values for each model is generated, showcasing a distinct line graph illustrating predictions for each shock.
+
+    ### Signs of High Risk
+
+    - Drastic alterations in model predictions due to minor shocks to an individual variable, indicating high
+    sensitivity and potential over-dependence on that variable.
+    - Unusually high or unpredictable shifts in response to shocks, suggesting potential model instability or
     overfitting.

-    [... 14 lines; only fragments captured: "understanding feature importance.", "reflect", "subjectivity in interpretation." ...]
+    ### Strengths
+
+    - Helps identify variables that strongly influence model outcomes, aiding in understanding feature importance.
+    - Generates visual plots, making results easily interpretable even to non-technical stakeholders.
+    - Useful in identifying overfitting and detecting unstable models that react excessively to minor variable changes.
+
+    ### Limitations
+
+    - Operates on the assumption that all other variables remain unchanged during the application of a shock, which may
+    not reflect real-world interdependencies.
+    - Best compatible with linear models and may not effectively evaluate the sensitivity of non-linear models.
+    - Provides a visual representation without a numerical risk measure, potentially introducing subjectivity in
+    interpretation.
     """

     name = "regression_sensitivity_plot"
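The shock mechanism described above (perturb one variable at a time, hold the rest constant, re-predict) can be sketched as follows. The helper name, shock sizes, and std-based scaling are assumptions for illustration, not the test's actual implementation:

```python
import pandas as pd

def apply_shocks(model, X: pd.DataFrame, shocks=(0.01, 0.05, 0.10)):
    """Collect predictions after shocking each feature by a fraction of its std."""
    results = {}
    for col in X.columns:                  # one variable at a time
        for shock in shocks:               # shocks of varying magnitudes
            X_shocked = X.copy()           # all other variables held constant
            X_shocked[col] = X_shocked[col] + shock * X[col].std()
            results[(col, shock)] = model.predict(X_shocked)
    return results
```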
validmind/tests/model_validation/statsmodels/RegressionModelSummary.py

@@ -17,36 +17,38 @@ class RegressionModelSummary(Metric):
     """
     Evaluates regression model performance using metrics including R-Squared, Adjusted R-Squared, MSE, and RMSE.

-    [... 18 lines of the previous docstring, not captured in this view ...]
+    ### Purpose
+
+    The Regression Model Summary test evaluates the performance of regression models by measuring their predictive
+    ability regarding dependent variables given changes in the independent variables. It uses conventional regression
+    metrics such as R-Squared, Adjusted R-Squared, Mean Squared Error (MSE), and Root Mean Squared Error (RMSE) to
+    assess the model's accuracy and fit.
+
+    ### Test Mechanism
+
+    This test employs the 'train_ds' attribute of the model to gather and analyze the training data. Initially, it
+    fetches the independent variables and uses the model to make predictions on these given features. Subsequently, it
+    calculates several standard regression performance metrics including R-Squared, Adjusted R-Squared, Mean Squared
+    Error (MSE), and Root Mean Squared Error (RMSE), which quantify the approximation of the predicted responses to the
+    actual responses.
+
+    ### Signs of High Risk
+
+    - Low R-Squared and Adjusted R-Squared values.
+    - High MSE and RMSE values.
+
+    ### Strengths
+
     - Offers an extensive evaluation of regression models by combining four key measures of model accuracy and fit.
     - Provides a comprehensive view of the model's performance.
-    - Both the R-Squared and Adjusted R-Squared measures are readily interpretable.
-    [... 3 lines not captured ...]
-    - Applicable exclusively to regression models.
-    [... 3 lines not captured ...]
-    - A high R-squared or adjusted R-squared may not necessarily indicate a good model, especially in cases where the
-    model is possibly overfitting the data.
+    - Both the R-Squared and Adjusted R-Squared measures are readily interpretable.
+
+    ### Limitations
+
+    - Applicable exclusively to regression models.
+    - RMSE and MSE might be sensitive to outliers.
+    - A high R-Squared or Adjusted R-Squared may not necessarily indicate a good model, especially in cases of
+    overfitting.
     """

     name = "regression_model_summary"
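For reference, the four metrics the docstring names can be computed as in this scikit-learn-based sketch; the validmind implementation may differ in details such as which dataset attribute supplies the data:

```python
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def regression_summary(y_true, y_pred, n_features):
    """R-Squared, Adjusted R-Squared, MSE, and RMSE for one set of predictions."""
    n = len(y_true)
    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - n_features - 1)
    mse = mean_squared_error(y_true, y_pred)
    return {"R-Squared": r2, "Adjusted R-Squared": adj_r2,
            "MSE": mse, "RMSE": float(np.sqrt(mse))}
```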
validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py

@@ -21,28 +21,35 @@ logger = get_logger(__name__)
 class RegressionPermutationFeatureImportance(Metric):
     """
     Assesses the significance of each feature in a model by evaluating the impact on model performance when feature
-    values are randomly rearranged. [...]
-    importance of features based on the decrease in model's predictive accuracy, typically R².
+    values are randomly rearranged.

-    [... 2 lines not captured ...]
+    ### Purpose
+
+    The primary purpose of this metric is to determine which features significantly impact the performance of a
+    regression model developed using statsmodels. The metric measures how much the prediction accuracy deteriorates
     when each feature's values are permuted.

-    [... 2 lines not captured ...]
+    ### Test Mechanism
+
+    This metric shuffles the values of each feature one at a time in the dataset, computes the model's performance
+    after each permutation, and compares it to the baseline performance. A significant decrease in performance
     indicates the importance of the feature.

-    [... 2 lines not captured ...]
+    ### Signs of High Risk
+
+    - Significant reliance on a feature that, when permuted, leads to a substantial decrease in performance, suggesting
     overfitting or high model dependency on that feature.
     - Features identified as unimportant despite known impacts from domain knowledge, suggesting potential issues in
     model training or data preprocessing.

-    [... 2 lines not captured ...]
+    ### Strengths
+
+    - Directly assesses the impact of each feature on model performance, providing clear insights into model
+    dependencies.
     - Model-agnostic within the scope of statsmodels, applicable to any regression model that outputs predictions.

-    [...]
+    ### Limitations
+
     - The metric is specific to statsmodels and cannot be used with other types of models without adaptation.
     - It does not capture interactions between features, which can lead to underestimating the importance of correlated
     features.
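The shuffle-and-score loop described in the Test Mechanism section, as a minimal sketch (using R² as the score, consistent with the docstring's "typically R²"; all names are illustrative):

```python
import numpy as np
from sklearn.metrics import r2_score

def permutation_importance(model, X, y, seed=0):
    """Per-feature drop in R² when that feature's values are shuffled."""
    rng = np.random.default_rng(seed)
    baseline = r2_score(y, model.predict(X))
    importances = {}
    for col in X.columns:
        X_perm = X.copy()
        X_perm[col] = rng.permutation(X_perm[col].values)
        # a large drop from the baseline marks an important feature
        importances[col] = baseline - r2_score(y, model.predict(X_perm))
    return importances
```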
validmind/tests/model_validation/statsmodels/ScorecardHistogram.py

@@ -2,136 +2,104 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
 import plotly.graph_objects as go
 from matplotlib import cm

-from validmind [...]
+from validmind import tags, tasks


-@[...]
-[...]
+@tags("visualization", "credit_risk", "logistic_regression")
+@tasks("classification")
+def ScorecardHistogram(dataset, title="Histogram of Scores", score_column="score"):
     """
-    [... 16 lines of the previous docstring, not captured in this view ...]
+    The Scorecard Histogram test evaluates the distribution of credit scores between default and non-default instances,
+    providing critical insights into the performance and generalizability of credit-risk models.
+
+    ### Purpose
+
+    The Scorecard Histogram test metric provides a visual interpretation of the credit scores generated by a machine
+    learning model for credit-risk classification tasks. It aims to compare the alignment of the model's scoring
+    decisions with the actual outcomes of credit loan applications. It helps in identifying potential discrepancies
+    between the model's predictions and real-world risk levels.
+
+    ### Test Mechanism
+
+    This metric uses logistic regression to generate a histogram of credit scores for both default (negative class) and
+    non-default (positive class) instances. Using both training and test datasets, the metric calculates the credit
+    score of each instance with a scorecard method, considering the impact of different features on the likelihood of
+    default. It includes the default point to odds (PDO) scaling factor and predefined target score and odds settings.
+    Histograms for training and test sets are computed and plotted separately to offer insights into the model's
+    generalizability to unseen data.
+
+    ### Signs of High Risk
+
+    - Discrepancies between the distributions of training and testing data, indicating a model's poor generalization
     ability
-    - Skewed distributions
+    - Skewed distributions favoring specific scores or classes, representing potential bias
+
+    ### Strengths

-    **Strengths**:
     - Provides a visual interpretation of the model's credit scoring system, enhancing comprehension of model behavior
     - Enables a direct comparison between actual and predicted scores for both training and testing data
     - Its intuitive visualization helps understand the model's ability to differentiate between positive and negative
     classes
     - Can unveil patterns or anomalies not easily discerned through numerical metrics alone

-    [... 2 lines not captured ...]
+    ### Limitations
+
+    - Despite its value for visual interpretation, it doesn't quantify the performance of the model and therefore may
     lack precision for thorough model evaluation
     - The quality of input data can strongly influence the metric, as bias or noise in the data will affect both the
     score calculation and resultant histogram
     - Its specificity to credit scoring models limits its applicability across a wider variety of machine learning
     tasks and models
-    - The metric's effectiveness is somewhat tied to the subjective interpretation of the analyst,
-    [...]
+    - The metric's effectiveness is somewhat tied to the subjective interpretation of the analyst, relying on their
+    judgment of the characteristics and implications of the plot.
     """

-    [... 4 lines not captured ...]
+    if score_column not in dataset.df.columns:
+        raise ValueError(
+            f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
+        )

-    [...]
-        "title": "Histogram of Scores",
-        "score_column": "score",
-    }
+    df = dataset.df

-    [...]
-    def plot_score_histogram(dataframes, dataset_titles, score_col, target_col, title):
-        figures = []
-        # Generate a colormap and convert to Plotly-accepted color format
-        # Adjust 'viridis' to any other matplotlib colormap if desired
-        colormap = cm.get_cmap("viridis")
-
-        for _, (df, dataset_title) in enumerate(zip(dataframes, dataset_titles)):
-            fig = go.Figure()
-
-            # Get unique classes and assign colors
-            classes = sorted(df[target_col].unique())
-            colors = [
-                colormap(i / len(classes))[:3] for i in range(len(classes))
-            ]  # RGB
-            color_dict = {
-                cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
-                for cls, rgb in zip(classes, colors)
-            }
-
-            for class_value in sorted(df[target_col].unique()):
-                scores_class = df[df[target_col] == class_value][score_col]
-                fig.add_trace(
-                    go.Histogram(
-                        x=scores_class,
-                        opacity=0.75,
-                        name=f"{dataset_title} {target_col} = {class_value}",
-                        marker=dict(
-                            color=color_dict[class_value],
-                        ),
-                    )
-                )
-            fig.update_layout(
-                barmode="overlay",
-                title_text=f"{title} - {dataset_title}",
-                xaxis_title="Score",
-                yaxis_title="Frequency",
-                legend_title=target_col,
-            )
-            figures.append(fig)
-        return figures
-
-    def run(self):
-        title = self.params["title"]
-        score_column = self.params["score_column"]
-        dataset_titles = [dataset.input_id for dataset in self.inputs.datasets]
-        target_column = self.inputs.datasets[0].target_column
-
-        dataframes = []
-        metric_value = {"score_histogram": {}}
-        for dataset in self.inputs.datasets:
-            if score_column not in dataset.df.columns:
-                raise ValueError(
-                    f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
-                )
-
-            dataframes.append(dataset.df.copy())
-            metric_value["score_histogram"][dataset.input_id] = list(
-                dataset.df[score_column]
-            )
+    fig = _plot_score_histogram(df, score_column, dataset.target_column, title)

-    [...]
-        dataframes, dataset_titles, score_column, target_column, title
-    )
+    return fig

-    figures_list = [
-        Figure(
-            for_object=self,
-            key=f"score_histogram_{title.replace(' ', '_')}_{i+1}",
-            figure=fig,
-        )
-        for i, fig in enumerate(figures)
-    ]

-    [...]
+def _plot_score_histogram(df, score_col, target_col, title):
+    # Generate a colormap and convert to Plotly-accepted color format
+    # Adjust 'viridis' to any other matplotlib colormap if desired
+    colormap = cm.get_cmap("viridis")
+
+    fig = go.Figure()
+
+    # Get unique classes and assign colors
+    classes = sorted(df[target_col].unique())
+    colors = [colormap(i / len(classes))[:3] for i in range(len(classes))]  # RGB
+    color_dict = {
+        cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
+        for cls, rgb in zip(classes, colors)
+    }
+
+    for class_value in sorted(df[target_col].unique()):
+        scores_class = df[df[target_col] == class_value][score_col]
+        fig.add_trace(
+            go.Histogram(
+                x=scores_class,
+                opacity=0.75,
+                name=f"{target_col} = {class_value}",
+                marker=dict(
+                    color=color_dict[class_value],
+                ),
+            )
+        )
+    fig.update_layout(
+        barmode="overlay",
+        title_text=f"{title}",
+        xaxis_title="Score",
+        yaxis_title="Frequency",
+    )
+    return fig
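With the class-based Metric replaced by a decorated function, the test now runs per dataset through the standard test runner. A hedged usage sketch (the dataset variable is a placeholder, and the exact invocation may vary by validmind version):

```python
import validmind as vm

# vm_train_ds: a ValidMind dataset initialized elsewhere whose dataframe
# already contains the "score" column produced by the scorecard model
result = vm.tests.run_test(
    "validmind.model_validation.statsmodels.ScorecardHistogram",
    inputs={"dataset": vm_train_ds},
    params={"title": "Histogram of Scores", "score_column": "score"},
)
```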
validmind/tests/ongoing_monitoring/FeatureDrift.py

@@ -16,37 +16,41 @@ def FeatureDrift(
     datasets, bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], feature_columns=None
 ):
     """
-    [...]
+    Evaluates changes in feature distribution over time to identify potential model drift.
+
+    ### Purpose

     The Feature Drift test aims to evaluate how much the distribution of features has shifted over time between two
     datasets, typically training and monitoring datasets. It uses the Population Stability Index (PSI) to quantify this
-    change, providing insights into the model [...]
+    change, providing insights into the model’s robustness and the necessity for retraining or feature engineering.

-    [...]
+    ### Test Mechanism

     This test calculates the PSI by:
+
     - Bucketing the distributions of each feature in both datasets.
     - Comparing the percentage of observations in each bucket between the two datasets.
     - Aggregating the differences across all buckets for each feature to produce the PSI score for that feature.

     The PSI score is interpreted as:
+
     - PSI < 0.1: No significant population change.
     - PSI < 0.2: Moderate population change.
     - PSI >= 0.2: Significant population change.

-    [...]
+    ### Signs of High Risk

     - PSI >= 0.2 for any feature, indicating a significant distribution shift.
     - Consistently high PSI scores across multiple features.
     - Sudden spikes in PSI in recent monitoring data compared to historical data.

-    [...]
+    ### Strengths

     - Provides a quantitative measure of feature distribution changes.
     - Easily interpretable thresholds for decision-making.
     - Helps in early detection of data drift, prompting timely interventions.

-    [...]
+    ### Limitations

     - May not capture more intricate changes in data distribution nuances.
     - Assumes that bucket thresholds (quantiles) adequately represent distribution shifts.