validmind 2.5.8__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/statsmodels/ScorecardHistogram.py

@@ -2,136 +2,104 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import plotly.graph_objects as go
 from matplotlib import cm
 
-from validmind …
+from validmind import tags, tasks
 
 
-@…
-…
+@tags("visualization", "credit_risk", "logistic_regression")
+@tasks("classification")
+def ScorecardHistogram(dataset, title="Histogram of Scores", score_column="score"):
     """
-    …
+    The Scorecard Histogram test evaluates the distribution of credit scores between default and non-default instances,
+    providing critical insights into the performance and generalizability of credit-risk models.
+
+    ### Purpose
+
+    The Scorecard Histogram test metric provides a visual interpretation of the credit scores generated by a machine
+    learning model for credit-risk classification tasks. It aims to compare the alignment of the model's scoring
+    decisions with the actual outcomes of credit loan applications. It helps in identifying potential discrepancies
+    between the model's predictions and real-world risk levels.
+
+    ### Test Mechanism
+
+    This metric uses logistic regression to generate a histogram of credit scores for both default (negative class) and
+    non-default (positive class) instances. Using both training and test datasets, the metric calculates the credit
+    score of each instance with a scorecard method, considering the impact of different features on the likelihood of
+    default. It includes the default point to odds (PDO) scaling factor and predefined target score and odds settings.
+    Histograms for training and test sets are computed and plotted separately to offer insights into the model's
+    generalizability to unseen data.
+
+    ### Signs of High Risk
+
+    - Discrepancies between the distributions of training and testing data, indicating a model's poor generalization
     ability
-    - Skewed distributions …
+    - Skewed distributions favoring specific scores or classes, representing potential bias
+
+    ### Strengths
 
-    **Strengths**:
     - Provides a visual interpretation of the model's credit scoring system, enhancing comprehension of model behavior
     - Enables a direct comparison between actual and predicted scores for both training and testing data
     - Its intuitive visualization helps understand the model's ability to differentiate between positive and negative
     classes
     - Can unveil patterns or anomalies not easily discerned through numerical metrics alone
 
-    …
+    ### Limitations
+
+    - Despite its value for visual interpretation, it doesn't quantify the performance of the model and therefore may
     lack precision for thorough model evaluation
     - The quality of input data can strongly influence the metric, as bias or noise in the data will affect both the
     score calculation and resultant histogram
     - Its specificity to credit scoring models limits its applicability across a wider variety of machine learning
     tasks and models
-    - The metric's effectiveness is somewhat tied to the subjective interpretation of the analyst, …
+    - The metric's effectiveness is somewhat tied to the subjective interpretation of the analyst, relying on their
+    judgment of the characteristics and implications of the plot.
     """
 
-    …
+    if score_column not in dataset.df.columns:
+        raise ValueError(
+            f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
+        )
 
-    …
-        "title": "Histogram of Scores",
-        "score_column": "score",
-    }
+    df = dataset.df
 
-    …
-    def plot_score_histogram(dataframes, dataset_titles, score_col, target_col, title):
-        figures = []
-        # Generate a colormap and convert to Plotly-accepted color format
-        # Adjust 'viridis' to any other matplotlib colormap if desired
-        colormap = cm.get_cmap("viridis")
-
-        for _, (df, dataset_title) in enumerate(zip(dataframes, dataset_titles)):
-            fig = go.Figure()
-
-            # Get unique classes and assign colors
-            classes = sorted(df[target_col].unique())
-            colors = [
-                colormap(i / len(classes))[:3] for i in range(len(classes))
-            ]  # RGB
-            color_dict = {
-                cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
-                for cls, rgb in zip(classes, colors)
-            }
-
-            for class_value in sorted(df[target_col].unique()):
-                scores_class = df[df[target_col] == class_value][score_col]
-                fig.add_trace(
-                    go.Histogram(
-                        x=scores_class,
-                        opacity=0.75,
-                        name=f"{dataset_title} {target_col} = {class_value}",
-                        marker=dict(
-                            color=color_dict[class_value],
-                        ),
-                    )
-                )
-            fig.update_layout(
-                barmode="overlay",
-                title_text=f"{title} - {dataset_title}",
-                xaxis_title="Score",
-                yaxis_title="Frequency",
-                legend_title=target_col,
-            )
-            figures.append(fig)
-        return figures
-
-    def run(self):
-        title = self.params["title"]
-        score_column = self.params["score_column"]
-        dataset_titles = [dataset.input_id for dataset in self.inputs.datasets]
-        target_column = self.inputs.datasets[0].target_column
-
-        dataframes = []
-        metric_value = {"score_histogram": {}}
-        for dataset in self.inputs.datasets:
-            if score_column not in dataset.df.columns:
-                raise ValueError(
-                    f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
-                )
-
-            dataframes.append(dataset.df.copy())
-            metric_value["score_histogram"][dataset.input_id] = list(
-                dataset.df[score_column]
-            )
+    fig = _plot_score_histogram(df, score_column, dataset.target_column, title)
 
-        …
-            dataframes, dataset_titles, score_column, target_column, title
-        )
+    return fig
 
-        figures_list = [
-            Figure(
-                for_object=self,
-                key=f"score_histogram_{title.replace(' ', '_')}_{i+1}",
-                figure=fig,
-            )
-            for i, fig in enumerate(figures)
-        ]
 
-        …
+def _plot_score_histogram(df, score_col, target_col, title):
+    # Generate a colormap and convert to Plotly-accepted color format
+    # Adjust 'viridis' to any other matplotlib colormap if desired
+    colormap = cm.get_cmap("viridis")
+
+    fig = go.Figure()
+
+    # Get unique classes and assign colors
+    classes = sorted(df[target_col].unique())
+    colors = [colormap(i / len(classes))[:3] for i in range(len(classes))]  # RGB
+    color_dict = {
+        cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
+        for cls, rgb in zip(classes, colors)
+    }
+
+    for class_value in sorted(df[target_col].unique()):
+        scores_class = df[df[target_col] == class_value][score_col]
+        fig.add_trace(
+            go.Histogram(
+                x=scores_class,
+                opacity=0.75,
+                name=f"{target_col} = {class_value}",
+                marker=dict(
+                    color=color_dict[class_value],
+                ),
+            )
+        )
+    fig.update_layout(
+        barmode="overlay",
+        title_text=f"{title}",
+        xaxis_title="Score",
+        yaxis_title="Frequency",
+    )
+    return fig
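
The refactor above turns ScorecardHistogram into a plain decorated function that builds one overlaid Plotly histogram per target class. A minimal standalone sketch of that plotting logic on synthetic data — the `score`/`default` column names and the distributions are hypothetical, and Plotly's default colors are used instead of the matplotlib colormap:

```python
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Synthetic scored dataset (hypothetical column names and distributions).
rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "score": np.concatenate([rng.normal(620, 50, 500), rng.normal(540, 60, 120)]),
        "default": np.concatenate([np.zeros(500, dtype=int), np.ones(120, dtype=int)]),
    }
)

# One overlaid histogram trace per target class, as the new helper does.
fig = go.Figure()
for class_value in sorted(df["default"].unique()):
    fig.add_trace(
        go.Histogram(
            x=df.loc[df["default"] == class_value, "score"],
            opacity=0.75,
            name=f"default = {class_value}",
        )
    )
fig.update_layout(
    barmode="overlay",
    title_text="Histogram of Scores",
    xaxis_title="Score",
    yaxis_title="Frequency",
)
fig.show()
```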

validmind/tests/model_validation/statsmodels/ShapiroWilk.py

@@ -11,29 +11,36 @@ class ShapiroWilk(Metric):
     """
     Evaluates feature-wise normality of training data using the Shapiro-Wilk test.
 
-    …
+    ### Purpose
+
+    The Shapiro-Wilk test is utilized to investigate whether a particular dataset conforms to the standard normal
+    distribution. This analysis is crucial in machine learning modeling because the normality of the data can
     profoundly impact the performance of the model. This metric is especially useful in evaluating various features of
     the dataset in both classification and regression tasks.
 
-    …
+    ### Test Mechanism
+
+    The Shapiro-Wilk test is conducted on each feature column of the training dataset to determine if the data
+    contained fall within the normal distribution. The test presents a statistic and a p-value, with the p-value
+    serving to validate or repudiate the null hypothesis, which is that the tested data is normally distributed.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk**:
     - A p-value that falls below 0.05 signifies a high risk as it discards the null hypothesis, indicating that the
     data does not adhere to the normal distribution.
     - For machine learning models built on the presumption of data normality, such an outcome could result in subpar
     performance or incorrect predictions.
 
-    …
+    ### Strengths
+
     - The Shapiro-Wilk test is esteemed for its level of accuracy, thereby making it particularly well-suited to
     datasets of small to moderate sizes.
     - It proves its versatility through its efficient functioning in both classification and regression tasks.
     - By separately testing each feature column, the Shapiro-Wilk test can raise an alarm if a specific feature does
     not comply with the normality.
 
-    …
+    ### Limitations
+
     - The Shapiro-Wilk test's sensitivity can be a disadvantage as it often rejects the null hypothesis (i.e., data is
     normally distributed), even for minor deviations, especially in large datasets. This may lead to unwarranted 'false
     alarms' of high risk by deeming the data as not normally distributed even if it approximates normal distribution.
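
The mechanism described in the ShapiroWilk docstring — run the test on each feature column and read off the statistic and p-value — can be sketched directly with `scipy.stats.shapiro`; the DataFrame and feature names below are hypothetical:

```python
import numpy as np
import pandas as pd
from scipy import stats

# Hypothetical training features: one roughly normal, one clearly skewed.
rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "feature_a": rng.normal(size=300),
        "feature_b": rng.exponential(size=300),
    }
)

# Shapiro-Wilk per feature column; p < 0.05 rejects the null of normality.
for column in df.columns:
    statistic, p_value = stats.shapiro(df[column])
    verdict = "non-normal (p < 0.05)" if p_value < 0.05 else "consistent with normality"
    print(f"{column}: W={statistic:.4f}, p={p_value:.4g} -> {verdict}")
```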

validmind/tests/ongoing_monitoring/FeatureDrift.py

@@ -16,37 +16,41 @@ def FeatureDrift(
     datasets, bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], feature_columns=None
 ):
     """
-    …
+    Evaluates changes in feature distribution over time to identify potential model drift.
+
+    ### Purpose
 
     The Feature Drift test aims to evaluate how much the distribution of features has shifted over time between two
     datasets, typically training and monitoring datasets. It uses the Population Stability Index (PSI) to quantify this
-    change, providing insights into the model …
+    change, providing insights into the model’s robustness and the necessity for retraining or feature engineering.
 
-    …
+    ### Test Mechanism
 
     This test calculates the PSI by:
+
     - Bucketing the distributions of each feature in both datasets.
     - Comparing the percentage of observations in each bucket between the two datasets.
     - Aggregating the differences across all buckets for each feature to produce the PSI score for that feature.
 
     The PSI score is interpreted as:
+
     - PSI < 0.1: No significant population change.
     - PSI < 0.2: Moderate population change.
     - PSI >= 0.2: Significant population change.
 
-    …
+    ### Signs of High Risk
 
     - PSI >= 0.2 for any feature, indicating a significant distribution shift.
    - Consistently high PSI scores across multiple features.
     - Sudden spikes in PSI in recent monitoring data compared to historical data.
 
-    …
+    ### Strengths
 
     - Provides a quantitative measure of feature distribution changes.
     - Easily interpretable thresholds for decision-making.
     - Helps in early detection of data drift, prompting timely interventions.
 
-    …
+    ### Limitations
 
     - May not capture more intricate changes in data distribution nuances.
     - Assumes that bucket thresholds (quantiles) adequately represent distribution shifts.
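
The PSI calculation outlined in the FeatureDrift docstring (bucket both distributions, compare bucket shares, aggregate) is conventionally computed as PSI = Σ (p_monitoring − p_reference) · ln(p_monitoring / p_reference) over the buckets. A minimal sketch under that assumption, reusing the default quantile bins from the signature above; the data and the helper name are hypothetical:

```python
import numpy as np

def psi(reference, monitoring, quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    """Population Stability Index between two 1-D samples, with buckets cut at
    quantiles of the reference sample (mirroring the default bins above)."""
    cut_points = np.quantile(reference, quantiles)
    ref_counts = np.bincount(np.digitize(reference, cut_points), minlength=len(cut_points) + 1)
    mon_counts = np.bincount(np.digitize(monitoring, cut_points), minlength=len(cut_points) + 1)
    eps = 1e-6  # avoid log(0) / division by zero for empty buckets
    ref_pct = np.clip(ref_counts / ref_counts.sum(), eps, None)
    mon_pct = np.clip(mon_counts / mon_counts.sum(), eps, None)
    return float(np.sum((mon_pct - ref_pct) * np.log(mon_pct / ref_pct)))

rng = np.random.default_rng(0)
reference = rng.normal(0.0, 1.0, 5_000)   # e.g. a feature at training time
monitoring = rng.normal(0.4, 1.2, 5_000)  # the same feature in production, shifted

score = psi(reference, monitoring)
if score >= 0.2:
    label = "significant population change"
elif score >= 0.1:
    label = "moderate population change"
else:
    label = "no significant population change"
print(f"PSI = {score:.3f} -> {label}")
```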

validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py

@@ -12,31 +12,37 @@ from validmind import tags, tasks
 @tasks("monitoring")
 def PredictionAcrossEachFeature(datasets, model):
     """
-    …
+    Assesses differences in model predictions across individual features between reference and monitoring datasets
+    through visual analysis.
+
+    ### Purpose
+
+    The Prediction Across Each Feature test aims to visually compare model predictions for each feature between
+    reference (training) and monitoring (production) datasets. It helps identify significant differences in prediction
+    patterns for further investigation and ensures the model's consistency and stability over time.
+
+    ### Test Mechanism
+
+    The test generates scatter plots for each feature, comparing prediction probabilities between the reference and
+    monitoring datasets. Each plot consists of two subplots: one for reference data and one for monitoring data,
+    enabling visual comparison of the model's predictive behavior.
+
+    ### Signs of High Risk
+
+    - Significant discrepancies between the reference and monitoring subplots for the same feature.
+    - Unexpected patterns or trends in monitoring data that were absent in reference data.
+
+    ### Strengths
+
+    - Provides a clear visual representation of model performance across different features.
+    - Facilitates easy identification of features where the model's predictions have diverged.
+    - Enables quick detection of potential model performance issues in production.
+
+    ### Limitations
+
+    - Interpretation of scatter plots can be subjective and may require expertise.
+    - Visualizations do not provide quantitative metrics for objective evaluation.
+    - May not capture all types of distribution changes or issues with the model's predictions.
     """
 
     """
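
A rough sketch of the per-feature, two-subplot comparison described above, using `plotly.subplots`; the feature name, probability column, and data are hypothetical stand-ins for the reference and monitoring datasets:

```python
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Hypothetical reference vs. monitoring data: one feature plus the model's
# predicted probability for each row.
rng = np.random.default_rng(0)
reference = pd.DataFrame({"age": rng.normal(40, 10, 300)})
reference["probability"] = 1 / (1 + np.exp(-(reference["age"] - 40) / 10))
monitoring = pd.DataFrame({"age": rng.normal(48, 12, 300)})
monitoring["probability"] = 1 / (1 + np.exp(-(monitoring["age"] - 45) / 8))

# Two side-by-side scatter subplots: reference on the left, monitoring on the right.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Reference", "Monitoring"))
fig.add_trace(
    go.Scatter(x=reference["age"], y=reference["probability"], mode="markers", name="reference"),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=monitoring["age"], y=monitoring["probability"], mode="markers", name="monitoring"),
    row=1, col=2,
)
fig.update_layout(title_text="Prediction across feature: age")
fig.show()
```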

validmind/tests/ongoing_monitoring/PredictionCorrelation.py

@@ -13,30 +13,38 @@ from validmind import tags, tasks
 @tasks("monitoring")
 def PredictionCorrelation(datasets, model):
     """
-    …
-    monitoring datasets. The primary goal is to detect significant changes in these pairs, which may signal target
-    drift, leading to lower model performance.
+    Assesses correlation changes between model predictions from reference and monitoring datasets to detect potential
+    target drift.
 
-    …
+    ### Purpose
+
+    To evaluate the changes in correlation pairs between model predictions and features from reference and monitoring
+    datasets. This helps in identifying significant shifts that may indicate target drift, potentially affecting model
+    performance.
+
+    ### Test Mechanism
+
+    This test calculates the correlation of each feature with model predictions for both reference and monitoring
+    datasets. It then compares these correlations side-by-side using a bar plot and a correlation table. Significant
+    changes in correlation pairs are highlighted to signal possible model drift.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk:**
     - Significant changes in correlation pairs between the reference and monitoring predictions.
-    - Notable correlation …
-    variable.
-    …
+    - Notable differences in correlation values, indicating a possible shift in the relationship between features and
+    the target variable.
+
+    ### Strengths
+
+    - Provides visual identification of drift in feature relationships with model predictions.
+    - Clear bar plot comparison aids in understanding model stability over time.
+    - Enables early detection of target drift, facilitating timely interventions.
+
+    ### Limitations
+
+    - Requires substantial reference and monitoring data for accurate comparison.
+    - Correlation does not imply causation; other factors may influence changes.
+    - Focuses solely on linear relationships, potentially missing non-linear interactions.
     """
 
     prediction_prob_column = f"{model.input_id}_probabilities"
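
A sketch of the correlation comparison the PredictionCorrelation docstring describes, assuming plain pandas DataFrames with a prediction-probability column analogous to the `{model.input_id}_probabilities` column used in the code; all column names, the helper function, and the data are hypothetical:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

def make_dataset(shift):
    # Hypothetical features plus a prediction-probability column.
    df = pd.DataFrame(
        {
            "income": rng.normal(50 + shift, 10, 1_000),
            "age": rng.normal(40, 12, 1_000),
        }
    )
    df["probabilities"] = 1 / (1 + np.exp(-(df["income"] - 50) / 10))
    return df

reference = make_dataset(shift=0)
monitoring = make_dataset(shift=8)
features = ["income", "age"]

# Correlation of each feature with the model's predicted probabilities,
# computed separately for the reference and monitoring datasets.
comparison = pd.DataFrame(
    {
        "reference_corr": reference[features].corrwith(reference["probabilities"]),
        "monitoring_corr": monitoring[features].corrwith(monitoring["probabilities"]),
    }
)
comparison["abs_change"] = (comparison["monitoring_corr"] - comparison["reference_corr"]).abs()
print(comparison.sort_values("abs_change", ascending=False))
```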

validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py

@@ -12,29 +12,37 @@ from validmind import tags, tasks
 @tasks("monitoring")
 def TargetPredictionDistributionPlot(datasets, model):
     """
-    …
+    Assesses differences in prediction distributions between a reference dataset and a monitoring dataset to identify
+    potential data drift.
+
+    ### Purpose
+
+    The Target Prediction Distribution Plot test aims to evaluate potential changes in the prediction distributions
+    between the reference and new monitoring datasets. It seeks to identify underlying shifts in data characteristics
+    that warrant further investigation.
+
+    ### Test Mechanism
+
+    This test generates Kernel Density Estimation (KDE) plots for prediction probabilities from both the reference and
+    monitoring datasets. By visually comparing the KDE plots, it assesses significant differences in the prediction
+    distributions between the two datasets.
+
+    ### Signs of High Risk
+
+    - Significant divergence between the distribution curves of reference and monitoring predictions.
+    - Unusual shifts or bimodal distribution in the monitoring predictions compared to the reference predictions.
+
+    ### Strengths
+
+    - Visual representation makes it easy to spot differences in prediction distributions.
+    - Useful for identifying potential data drift or changes in underlying data characteristics.
+    - Simple and efficient to implement using standard plotting libraries.
+
+    ### Limitations
+
+    - Subjective interpretation of the visual plots.
+    - Might not pinpoint the exact cause of distribution changes.
+    - Less effective if the differences in distributions are subtle and not easily visible.
     """
 
     pred_ref = datasets[0].y_prob_df(model)
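
The KDE comparison in TargetPredictionDistributionPlot can be sketched with `scipy.stats.gaussian_kde` and matplotlib; the probability arrays below are hypothetical stand-ins for `datasets[0].y_prob_df(model)` and its monitoring counterpart:

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Hypothetical predicted probabilities for the reference and monitoring datasets.
rng = np.random.default_rng(0)
pred_ref = rng.beta(2, 5, 2_000)
pred_mon = rng.beta(3, 3, 2_000)

# Kernel density estimates evaluated on a common grid, then plotted together.
grid = np.linspace(0, 1, 200)
plt.plot(grid, gaussian_kde(pred_ref)(grid), label="reference")
plt.plot(grid, gaussian_kde(pred_mon)(grid), label="monitoring")
plt.xlabel("Predicted probability")
plt.ylabel("Density")
plt.title("Prediction distribution: reference vs. monitoring")
plt.legend()
plt.show()
```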
|
@@ -27,42 +27,45 @@ from .ai_powered_test import (
|
|
27
27
|
@dataclass
|
28
28
|
class Bias(ThresholdTest):
|
29
29
|
"""
|
30
|
-
|
30
|
+
Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
|
31
|
+
prompt.
|
32
|
+
|
33
|
+
### Purpose
|
31
34
|
|
32
|
-
**Purpose:**
|
33
35
|
The Bias Evaluation test calculates if and how the order and distribution of exemplars (examples) in a few-shot
|
34
36
|
learning prompt affect the output of a Large Language Model (LLM). The results of this evaluation can be used to
|
35
37
|
fine-tune the model's performance and manage any unintended biases in its results.
|
36
38
|
|
37
|
-
|
39
|
+
### Test Mechanism
|
40
|
+
|
38
41
|
This test uses two checks:
|
39
42
|
|
40
|
-
1.
|
43
|
+
1. **Distribution of Exemplars:** The number of positive vs. negative examples in a prompt is varied. The test then
|
41
44
|
examines the LLM's classification of a neutral or ambiguous statement under these circumstances.
|
42
|
-
2.
|
45
|
+
2. **Order of Exemplars:** The sequence in which positive and negative examples are presented to the model is
|
43
46
|
modified. Their resultant effect on the LLM's response is studied.
|
44
47
|
|
45
48
|
For each test case, the LLM grades the input prompt on a scale of 1 to 10. It evaluates whether the examples in the
|
46
49
|
prompt could produce biased responses. The test only passes if the score meets or exceeds a predetermined minimum
|
47
|
-
threshold. This threshold is set at 7 by default
|
50
|
+
threshold. This threshold is set at 7 by default but can be modified as per the requirements via the test
|
48
51
|
parameters.
|
49
52
|
|
50
|
-
|
53
|
+
### Signs of High Risk
|
51
54
|
|
52
55
|
- A skewed result favoring either positive or negative responses may suggest potential bias in the model. This skew
|
53
56
|
could be caused by an unbalanced distribution of positive and negative exemplars.
|
54
57
|
- If the score given by the model is less than the set minimum threshold, it might indicate a risk of high bias and
|
55
58
|
hence poor performance.
|
56
59
|
|
57
|
-
|
60
|
+
### Strengths
|
58
61
|
|
59
|
-
- This test provides a quantitative measure of potential bias,
|
62
|
+
- This test provides a quantitative measure of potential bias, offering clear guidelines for developers about
|
60
63
|
whether their Large Language Model (LLM) contains significant bias.
|
61
|
-
- It
|
64
|
+
- It is useful in evaluating the impartiality of the model based on the distribution and sequence of examples.
|
62
65
|
- The flexibility to adjust the minimum required threshold allows tailoring this test to stricter or more lenient
|
63
66
|
bias standards.
|
64
67
|
|
65
|
-
|
68
|
+
### Limitations
|
66
69
|
|
67
70
|
- The test may not pick up on more subtle forms of bias or biases that are not directly related to the distribution
|
68
71
|
or order of exemplars.
|

validmind/tests/prompt_validation/Clarity.py

@@ -29,36 +29,38 @@ class Clarity(ThresholdTest):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.
 
-    …
+    ### Purpose
+
     The Clarity evaluation metric is used to assess how clear the prompts of a Large Language Model (LLM) are. This
     assessment is particularly important because clear prompts assist the LLM in more accurately interpreting and
     responding to instructions.
 
-    …
+    ### Test Mechanism
+
     The evaluation uses an LLM to scrutinize the clarity of prompts, factoring in considerations such as the inclusion
-    of relevant details, persona adoption, step-by-step instructions, usage of examples and specification of desired
+    of relevant details, persona adoption, step-by-step instructions, usage of examples, and specification of desired
     output length. Each prompt is rated on a clarity scale of 1 to 10, and any prompt scoring at or above the preset
     threshold (default of 7) will be marked as clear. It is important to note that this threshold can be adjusted via
     test parameters, providing flexibility in the evaluation process.
 
-    …
+    ### Signs of High Risk
 
     - Prompts that consistently score below the clarity threshold
-    - Repeated failure of prompts to adhere to guidelines for clarity
-    …
+    - Repeated failure of prompts to adhere to guidelines for clarity, including detail inclusion, persona adoption,
+    explicit step-by-step instructions, use of examples, and specification of output length
 
-    …
+    ### Strengths
 
-    - Encourages the development of more effective prompts that aid the LLM in interpreting instructions accurately
-    - Applies a quantifiable measure (a score from 1 to 10) to evaluate the clarity of prompts
-    - Threshold for clarity is adjustable, allowing for flexible evaluation depending on the context
+    - Encourages the development of more effective prompts that aid the LLM in interpreting instructions accurately
+    - Applies a quantifiable measure (a score from 1 to 10) to evaluate the clarity of prompts
+    - Threshold for clarity is adjustable, allowing for flexible evaluation depending on the context
 
-    …
+    ### Limitations
 
-    - Scoring system is subjective and relies on the AI’s interpretation of 'clarity'
+    - Scoring system is subjective and relies on the AI’s interpretation of 'clarity'
     - The test assumes that all required factors (detail inclusion, persona adoption, step-by-step instructions, use of
-    examples, and specification of output length) contribute equally to clarity, which might not always be the case
-    - The evaluation may not be as effective if used on non-textual models
+    examples, and specification of output length) contribute equally to clarity, which might not always be the case
+    - The evaluation may not be as effective if used on non-textual models
     """
 
     name = "clarity"