validmind 2.8.10__py3-none-any.whl → 2.8.20__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- validmind/__init__.py +6 -5
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +17 -11
- validmind/ai/utils.py +2 -2
- validmind/api_client.py +75 -32
- validmind/client.py +108 -100
- validmind/client_config.py +3 -3
- validmind/datasets/classification/__init__.py +7 -3
- validmind/datasets/credit_risk/lending_club.py +28 -16
- validmind/datasets/nlp/cnn_dailymail.py +10 -4
- validmind/datasets/regression/__init__.py +22 -5
- validmind/errors.py +17 -7
- validmind/input_registry.py +1 -1
- validmind/logging.py +44 -35
- validmind/models/foundation.py +2 -2
- validmind/models/function.py +10 -3
- validmind/template.py +30 -22
- validmind/test_suites/__init__.py +2 -2
- validmind/tests/_store.py +13 -4
- validmind/tests/comparison.py +65 -33
- validmind/tests/data_validation/ACFandPACFPlot.py +4 -1
- validmind/tests/data_validation/AutoMA.py +1 -1
- validmind/tests/data_validation/BivariateScatterPlots.py +5 -1
- validmind/tests/data_validation/BoxPierce.py +3 -1
- validmind/tests/data_validation/ClassImbalance.py +4 -2
- validmind/tests/data_validation/DatasetDescription.py +3 -24
- validmind/tests/data_validation/DescriptiveStatistics.py +1 -1
- validmind/tests/data_validation/DickeyFullerGLS.py +1 -1
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +1 -1
- validmind/tests/data_validation/HighCardinality.py +5 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +1 -1
- validmind/tests/data_validation/IQROutliersBarPlot.py +5 -3
- validmind/tests/data_validation/IQROutliersTable.py +5 -2
- validmind/tests/data_validation/IsolationForestOutliers.py +5 -4
- validmind/tests/data_validation/JarqueBera.py +2 -2
- validmind/tests/data_validation/LJungBox.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/MissingValues.py +14 -10
- validmind/tests/data_validation/MissingValuesBarPlot.py +3 -1
- validmind/tests/data_validation/MutualInformation.py +2 -1
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +1 -1
- validmind/tests/data_validation/ProtectedClassesCombination.py +2 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +2 -2
- validmind/tests/data_validation/ProtectedClassesDisparity.py +9 -5
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +10 -2
- validmind/tests/data_validation/RollingStatsPlot.py +2 -1
- validmind/tests/data_validation/ScoreBandDefaultRates.py +4 -2
- validmind/tests/data_validation/SeasonalDecompose.py +1 -1
- validmind/tests/data_validation/ShapiroWilk.py +2 -2
- validmind/tests/data_validation/Skewness.py +7 -6
- validmind/tests/data_validation/SpreadPlot.py +1 -1
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +1 -1
- validmind/tests/data_validation/TabularDateTimeHistograms.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +4 -1
- validmind/tests/data_validation/TimeSeriesFrequency.py +1 -1
- validmind/tests/data_validation/TimeSeriesOutliers.py +7 -2
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/data_validation/ZivotAndrewsArch.py +5 -2
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +5 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +3 -1
- validmind/tests/data_validation/nlp/TextDescription.py +1 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +14 -11
- validmind/tests/load.py +38 -24
- validmind/tests/model_validation/BertScore.py +7 -1
- validmind/tests/model_validation/BleuScore.py +7 -1
- validmind/tests/model_validation/ClusterSizeDistribution.py +3 -1
- validmind/tests/model_validation/ContextualRecall.py +9 -1
- validmind/tests/model_validation/FeaturesAUC.py +1 -1
- validmind/tests/model_validation/MeteorScore.py +7 -1
- validmind/tests/model_validation/ModelPredictionResiduals.py +5 -1
- validmind/tests/model_validation/RegardScore.py +6 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -1
- validmind/tests/model_validation/RougeScore.py +3 -1
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +2 -0
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +10 -2
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +6 -2
- validmind/tests/model_validation/TokenDisparity.py +5 -1
- validmind/tests/model_validation/ToxicityScore.py +2 -0
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +5 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +5 -1
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -0
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +5 -1
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +6 -2
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +3 -1
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +5 -1
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +6 -1
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -3
- validmind/tests/model_validation/ragas/AspectCritic.py +4 -1
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -3
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -3
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -3
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -3
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -3
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +1 -1
- validmind/tests/model_validation/ragas/ResponseRelevancy.py +5 -3
- validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -3
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +9 -9
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +9 -9
- validmind/tests/model_validation/sklearn/CalibrationCurve.py +5 -2
- validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +28 -5
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -1
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +24 -14
- validmind/tests/model_validation/sklearn/CompletenessScore.py +8 -9
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -3
- validmind/tests/model_validation/sklearn/FeatureImportance.py +6 -2
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -9
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +14 -9
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +4 -2
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +6 -1
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +12 -7
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +12 -7
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +21 -6
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +5 -1
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -1
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +6 -1
- validmind/tests/model_validation/sklearn/ROCCurve.py +3 -1
- validmind/tests/model_validation/sklearn/RegressionErrors.py +6 -2
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +13 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +8 -5
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +5 -1
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +34 -26
- validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +10 -2
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -1
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -9
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +15 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +6 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +8 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +2 -2
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +6 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +8 -2
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +3 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +7 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -0
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -0
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +4 -2
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +3 -1
- validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +11 -1
- validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +10 -2
- validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +8 -1
- validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +18 -2
- validmind/tests/ongoing_monitoring/FeatureDrift.py +9 -2
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +8 -2
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +13 -2
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +13 -2
- validmind/tests/ongoing_monitoring/ROCCurveDrift.py +16 -2
- validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +11 -2
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +13 -2
- validmind/tests/output.py +66 -11
- validmind/tests/prompt_validation/Clarity.py +1 -1
- validmind/tests/prompt_validation/NegativeInstruction.py +1 -1
- validmind/tests/prompt_validation/Robustness.py +6 -1
- validmind/tests/prompt_validation/Specificity.py +1 -1
- validmind/tests/run.py +28 -14
- validmind/tests/test_providers.py +28 -35
- validmind/tests/utils.py +17 -4
- validmind/unit_metrics/__init__.py +1 -1
- validmind/utils.py +295 -31
- validmind/vm_models/dataset/dataset.py +19 -16
- validmind/vm_models/dataset/utils.py +5 -3
- validmind/vm_models/figure.py +6 -6
- validmind/vm_models/input.py +6 -5
- validmind/vm_models/model.py +5 -5
- validmind/vm_models/result/result.py +122 -43
- validmind/vm_models/result/utils.py +9 -28
- validmind/vm_models/test_suite/__init__.py +5 -0
- validmind/vm_models/test_suite/runner.py +5 -5
- validmind/vm_models/test_suite/summary.py +20 -2
- validmind/vm_models/test_suite/test.py +6 -6
- validmind/vm_models/test_suite/test_suite.py +10 -10
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/METADATA +4 -5
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/RECORD +189 -188
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/WHEEL +1 -1
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/LICENSE +0 -0
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/ContextEntityRecall.py

@@ -118,8 +118,10 @@ def ContextEntityRecall(

     score_column = "context_entity_recall"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Context Entity Recall"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Context Entity Recall")

     return (
         {
@@ -143,5 +145,5 @@ def ContextEntityRecall(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/ContextPrecision.py

@@ -114,8 +114,10 @@ def ContextPrecision(

     score_column = "llm_context_precision_with_reference"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Context Precision"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Context Precision")

     return (
         {
@@ -135,5 +137,5 @@ def ContextPrecision(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py

@@ -109,8 +109,10 @@ def ContextPrecisionWithoutReference(

     score_column = "llm_context_precision_without_reference"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Context Precision"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Context Precision")

     return (
         {
@@ -130,5 +132,5 @@ def ContextPrecisionWithoutReference(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/ContextRecall.py

@@ -114,8 +114,10 @@ def ContextRecall(

     score_column = "context_recall"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Context Recall"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Context Recall")

     return (
         {
@@ -135,5 +137,5 @@ def ContextRecall(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/Faithfulness.py

@@ -119,8 +119,10 @@ def Faithfulness(

     score_column = "faithfulness"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Faithfulness"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Faithfulness")

     return (
         {
@@ -140,5 +142,5 @@ def Faithfulness(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/ResponseRelevancy.py

@@ -133,8 +133,10 @@ def ResponseRelevancy(

     score_column = "answer_relevancy"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Response Relevancy"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Response Relevancy")

     return (
         {
@@ -154,5 +156,5 @@ def ResponseRelevancy(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/SemanticSimilarity.py

@@ -112,8 +112,10 @@ def SemanticSimilarity(

     score_column = "semantic_similarity"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Semantic Similarity"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Semantic Similarity")

     return (
         {
@@ -133,5 +135,5 @@ def SemanticSimilarity(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
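The seven ragas test hunks above share one change: each test now builds a histogram and a box plot from its score column and tags the RawData with the originating dataset's input_id. A minimal sketch of that plotting pattern, using a made-up result_df in place of the ragas evaluation output:

import pandas as pd
import plotly.express as px

# Stand-in for the ragas evaluation output; in the library, result_df comes
# from the ragas evaluation performed inside each test.
result_df = pd.DataFrame({"context_entity_recall": [0.2, 0.55, 0.7, 0.9, 1.0]})
score_column = "context_entity_recall"

# Same figure-building pattern the 2.8.20 tests use: histogram plus box plot.
fig_histogram = px.histogram(
    x=result_df[score_column].to_list(), nbins=10, title="Context Entity Recall"
)
fig_box = px.box(x=result_df[score_column].to_list(), title="Context Entity Recall")

fig_histogram.show()
fig_box.show()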
validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py

@@ -4,7 +4,7 @@

 from sklearn.metrics import adjusted_mutual_info_score

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -52,11 +52,11 @@ def AdjustedMutualInformation(model: VMModel, dataset: VMDataset):
     - The interpretability of the score can be complex as it depends on the understanding of information theory
     concepts.
     """
-
-
-
-
-
-
-
-
+    ami_score = adjusted_mutual_info_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    return [{"Adjusted Mutual Information": ami_score}], RawData(
+        ami_score=ami_score, model=model.input_id, dataset=dataset.input_id
+    )
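AdjustedMutualInformation now computes the score inline and returns a one-row table together with a RawData record carrying the score and the model/dataset input_ids; AdjustedRandIndex, CompletenessScore, FowlkesMallowsScore, HomogeneityScore, and ClusterPerformanceMetrics below receive the same treatment. A standalone sketch of the underlying computation, with hypothetical labels standing in for dataset.y and dataset.y_pred(model):

from sklearn.metrics import adjusted_mutual_info_score

# Hypothetical cluster labels standing in for dataset.y and dataset.y_pred(model).
labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [1, 1, 0, 0, 2, 2]

ami_score = adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred)

# The test returns this one-row table plus a RawData record of the score and
# the originating model/dataset input_ids.
print([{"Adjusted Mutual Information": ami_score}])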
validmind/tests/model_validation/sklearn/AdjustedRandIndex.py

@@ -4,7 +4,7 @@

 from sklearn.metrics import adjusted_rand_score

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -49,11 +49,11 @@ def AdjustedRandIndex(model: VMModel, dataset: VMDataset):
     - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
     heavily dependent on the characteristics of the dataset used.
     """
-
-
-
-
-
-
-
-
+    ari = adjusted_rand_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    return [{"Adjusted Rand Index": ari}], RawData(
+        ari_score=ari, model=model.input_id, dataset=dataset.input_id
+    )
validmind/tests/model_validation/sklearn/CalibrationCurve.py

@@ -72,7 +72,10 @@ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):

     # Create DataFrame for raw data
     raw_data = RawData(
-        mean_predicted_probability=prob_pred,
+        mean_predicted_probability=prob_pred,
+        observed_frequency=prob_true,
+        model=model.input_id,
+        dataset=dataset.input_id,
     )

     # Create Plotly figure
@@ -114,4 +117,4 @@ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):
         template="plotly_white",
     )

-    return
+    return fig, raw_data
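The 2.8.10 version of CalibrationCurve ended in a bare return, so neither the figure nor the raw data reached the caller; 2.8.20 returns both and records the observed frequencies alongside the mean predicted probabilities. A rough sketch of where those two arrays come from, assuming they are produced by scikit-learn's calibration_curve (the prob_true/prob_pred names in the diff suggest this, but the call itself is outside the hunk):

import numpy as np
from sklearn.calibration import calibration_curve

# Hypothetical labels and scores standing in for dataset.y and dataset.y_prob(model).
y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1])
y_prob = np.array([0.1, 0.3, 0.7, 0.8, 0.65, 0.2, 0.9, 0.4, 0.55, 0.95])

# prob_true maps to observed_frequency and prob_pred to mean_predicted_probability
# in the RawData record built by the test.
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=5)
print(prob_pred, prob_true)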
validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py

@@ -2,17 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

+from typing import Dict, List, Optional, Union
+
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


-def find_optimal_threshold(
+def find_optimal_threshold(
+    y_true: np.ndarray,
+    y_prob: np.ndarray,
+    method: str = "youden",
+    target_recall: Optional[float] = None,
+) -> Dict[str, Union[str, float]]:
     """
     Find the optimal classification threshold using various methods.

@@ -80,8 +87,11 @@ def find_optimal_threshold(y_true, y_prob, method="youden", target_recall=None):
 @tags("model_validation", "threshold_optimization", "classification_metrics")
 @tasks("classification")
 def ClassifierThresholdOptimization(
-    dataset: VMDataset,
-
+    dataset: VMDataset,
+    model: VMModel,
+    methods: Optional[List[str]] = None,
+    target_recall: Optional[float] = None,
+) -> Dict[str, Union[pd.DataFrame, go.Figure]]:
     """
     Analyzes and visualizes different threshold optimization methods for binary classification models.

@@ -255,4 +265,17 @@ def ClassifierThresholdOptimization(
     # Create results table and sort by threshold descending
     table = pd.DataFrame(results).sort_values("threshold", ascending=False)

-    return
+    return (
+        fig,
+        table,
+        RawData(
+            fpr=fpr,
+            tpr=tpr,
+            precision=precision,
+            recall=recall,
+            thresholds_roc=thresholds_roc,
+            thresholds_pr=thresholds_pr,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
+    )
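ClassifierThresholdOptimization now takes the model explicitly, accepts an optional methods list and target_recall, and returns the figure, the results table, and a RawData record with the ROC and precision-recall arrays (previously it ended in a bare return). The default method named in find_optimal_threshold is "youden"; the sketch below shows what a Youden's J threshold selection looks like on hypothetical data, noting that the library's own implementation may differ in detail:

import numpy as np
from sklearn.metrics import roc_curve

# Hypothetical labels/probabilities standing in for dataset.y and dataset.y_prob(model).
y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0])
y_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7, 0.9, 0.3, 0.6, 0.5])

# Youden's J statistic picks the threshold that maximizes tpr - fpr.
fpr, tpr, thresholds_roc = roc_curve(y_true, y_prob)
best = int(np.argmax(tpr - fpr))
print({"method": "youden", "threshold": float(thresholds_roc[best])})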
validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py

@@ -84,4 +84,8 @@ def ClusterCosineSimilarity(model: VMModel, dataset: VMDataset):
     if not table:
         raise SkipTestError("No clusters found")

-    return table, RawData(
+    return table, RawData(
+        cluster_centroids=cluster_centroids,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py

@@ -11,7 +11,7 @@ from sklearn.metrics import (
     v_measure_score,
 )

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel

 HOMOGENEITY = """
@@ -115,53 +115,63 @@ def ClusterPerformanceMetrics(model: VMModel, dataset: VMDataset):
     - Does not consider aspects like computational efficiency of the model or its capability to handle high dimensional
     data.
     """
-
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    metrics = [
         {
             "Metric": "Homogeneity Score",
             "Description": HOMOGENEITY,
             "Value": homogeneity_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Completeness Score",
             "Description": COMPLETENESS,
             "Value": completeness_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "V Measure",
             "Description": V_MEASURE,
             "Value": v_measure_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Adjusted Rand Index",
             "Description": ADJUSTED_RAND_INDEX,
             "Value": adjusted_rand_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
            ),
         },
         {
             "Metric": "Adjusted Mutual Information",
             "Description": ADJUSTED_MUTUAL_INFORMATION,
             "Value": adjusted_mutual_info_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Fowlkes-Mallows score",
             "Description": FOULKES_MALLOWS_SCORE,
             "Value": fowlkes_mallows_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
     ]
+
+    return metrics, RawData(
+        true_labels=y_true,
+        predicted_labels=y_pred,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/sklearn/CompletenessScore.py

@@ -4,7 +4,7 @@

 from sklearn.metrics import completeness_score

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -47,11 +47,10 @@ def CompletenessScore(model: VMModel, dataset: VMDataset):
     - The Completeness Score only applies to clustering models; it cannot be used for other types of machine learning
     models.
     """
-
-
-
-
-
-
-
-    ]
+    score = completeness_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+    return [{"Completeness Score": score}], RawData(
+        score=score, model=model.input_id, dataset=dataset.input_id
+    )
validmind/tests/model_validation/sklearn/ConfusionMatrix.py

@@ -19,7 +19,11 @@ from validmind.vm_models import VMDataset, VMModel
     "visualization",
 )
 @tasks("classification", "text_classification")
-def ConfusionMatrix(
+def ConfusionMatrix(
+    dataset: VMDataset,
+    model: VMModel,
+    threshold: float = 0.5,
+):
     """
     Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix
     heatmap.
@@ -66,7 +70,17 @@ def ConfusionMatrix(dataset: VMDataset, model: VMModel):
     - Risks of misinterpretation exist because the matrix doesn't directly provide precision, recall, or F1-score data.
     These metrics have to be computed separately.
     """
-
+    # Get predictions using threshold for binary classification if possible
+    if hasattr(model.model, "predict_proba"):
+        y_prob = dataset.y_prob(model)
+        # Handle both 1D and 2D probability arrays
+        if y_prob.ndim == 2:
+            y_pred = (y_prob[:, 1] > threshold).astype(int)
+        else:
+            y_pred = (y_prob > threshold).astype(int)
+    else:
+        y_pred = dataset.y_pred(model)
+
     y_true = dataset.y.astype(y_pred.dtype)

     labels = np.unique(y_true)
@@ -119,4 +133,9 @@ def ConfusionMatrix(dataset: VMDataset, model: VMModel):
         font=dict(size=14),
     )

-    return fig, RawData(
+    return fig, RawData(
+        confusion_matrix=cm,
+        threshold=threshold,
+        dataset=dataset.input_id,
+        model=model.input_id,
+    )
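ConfusionMatrix gains a threshold parameter (default 0.5). When the wrapped model exposes predict_proba, predictions are derived from the probabilities rather than dataset.y_pred(model), so the matrix can be recomputed at any operating point. The thresholding branch from the diff, applied to a hypothetical probability array:

import numpy as np

# Hypothetical output of dataset.y_prob(model); column 1 is the positive class.
y_prob = np.array([[0.8, 0.2], [0.4, 0.6], [0.3, 0.7], [0.9, 0.1]])
threshold = 0.5

# Same branch the updated test uses for 2D vs 1D probability arrays.
if y_prob.ndim == 2:
    y_pred = (y_prob[:, 1] > threshold).astype(int)
else:
    y_pred = (y_prob > threshold).astype(int)

print(y_pred)  # -> [0 1 1 0]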
validmind/tests/model_validation/sklearn/FeatureImportance.py

@@ -5,7 +5,7 @@
 import pandas as pd
 from sklearn.inspection import permutation_importance

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -78,4 +78,8 @@ def FeatureImportance(dataset: VMDataset, model: VMModel, num_features: int = 3)
         else:
             result[f"Feature {i + 1}"] = None

-    return pd.DataFrame([result])
+    return pd.DataFrame([result]), RawData(
+        permutation_importance=pfi_values,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
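FeatureImportance keeps its one-row table but now also returns RawData carrying the underlying pfi_values, which, given the sklearn.inspection.permutation_importance import in this file, presumably come from a call along these lines (the model, data, and parameters below are hypothetical):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Hypothetical model and data standing in for the VMModel/VMDataset inputs.
X, y = make_classification(n_samples=200, n_features=5, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X, y)

# The importances object is what the updated test exposes as RawData.permutation_importance.
pfi_values = permutation_importance(clf, X, y, n_repeats=5, random_state=0)
print(pfi_values.importances_mean)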
validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py

@@ -4,7 +4,7 @@

 from sklearn import metrics

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -52,11 +52,14 @@ def FowlkesMallowsScore(dataset: VMDataset, model: VMModel):
     - It does not handle mismatching numbers of clusters between the true and predicted labels. As such, it may return
     misleading results if the predicted labels suggest a different number of clusters than what is in the true labels.
     """
-
-
-
-
-
-
-
-
+    fowlkes_mallows_score = metrics.fowlkes_mallows_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    return [{"Fowlkes-Mallows score": fowlkes_mallows_score}], RawData(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/sklearn/HomogeneityScore.py

@@ -4,7 +4,7 @@

 from sklearn import metrics

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -50,11 +50,16 @@ def HomogeneityScore(dataset: VMDataset, model: VMModel):
     - The score does not address the actual number of clusters formed, or the evenness of cluster sizes. It only checks
     the homogeneity within the given clusters created by the model.
     """
-
-
-
-
-
-
-
-
+    homogeneity_score = metrics.homogeneity_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    raw_data = RawData(
+        y_true=dataset.y,
+        y_pred=dataset.y_pred(model),
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
+
+    return ([{"Homogeneity Score": homogeneity_score}], raw_data)
validmind/tests/model_validation/sklearn/HyperParametersTuning.py

@@ -7,7 +7,7 @@ from typing import Dict, List, Union
 from sklearn.metrics import make_scorer, recall_score
 from sklearn.model_selection import GridSearchCV

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -162,4 +162,6 @@ def HyperParametersTuning(

         results.append(row_result)

-    return results
+    return results, RawData(
+        model=model.input_id, dataset=dataset.input_id, param_grid=param_grid
+    )
validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py

@@ -124,4 +124,9 @@ def KMeansClustersOptimization(

     fig.update_layout(showlegend=False)

-    return fig, RawData(
+    return fig, RawData(
+        distortions=distortions,
+        silhouette_avg=silhouette_avg,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/sklearn/MinimumAccuracy.py

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 from sklearn.metrics import accuracy_score

+from validmind import RawData
 from validmind.tests import tags, tasks
 from validmind.vm_models import VMDataset, VMModel

@@ -50,10 +51,14 @@ def MinimumAccuracy(dataset: VMDataset, model: VMModel, min_threshold: float = 0
     """
     accuracy = accuracy_score(dataset.y, dataset.y_pred(model))

-    return
-
-
-
-
-
-
+    return (
+        [
+            {
+                "Score": accuracy,
+                "Threshold": min_threshold,
+                "Pass/Fail": "Pass" if accuracy > min_threshold else "Fail",
+            }
+        ],
+        accuracy > min_threshold,
+        RawData(model=model.input_id, dataset=dataset.input_id),
+    )
validmind/tests/model_validation/sklearn/MinimumF1Score.py

@@ -5,6 +5,7 @@
 import numpy as np
 from sklearn.metrics import f1_score

+from validmind import RawData
 from validmind.tests import tags, tasks
 from validmind.vm_models import VMDataset, VMModel

@@ -58,10 +59,14 @@ def MinimumF1Score(dataset: VMDataset, model: VMModel, min_threshold: float = 0.
     else:
         score = f1_score(dataset.y, dataset.y_pred(model))

-    return
-
-
-
-
-
-
+    return (
+        [
+            {
+                "Score": score,
+                "Threshold": min_threshold,
+                "Pass/Fail": "Pass" if score > min_threshold else "Fail",
+            }
+        ],
+        score > min_threshold,
+        RawData(score=score, model=model.input_id, dataset=dataset.input_id),
+    )
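MinimumAccuracy and MinimumF1Score previously ended in a bare return; both now return the score table, a pass/fail boolean, and a RawData record. A plain-Python sketch of the new contract for MinimumAccuracy, using hypothetical labels and an illustrative threshold:

from sklearn.metrics import accuracy_score

# Hypothetical labels standing in for dataset.y and dataset.y_pred(model).
y_true = [0, 1, 1, 0, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 0, 1, 1]
min_threshold = 0.7  # illustrative value; see the function signature for the real default

accuracy = accuracy_score(y_true, y_pred)
row = {
    "Score": accuracy,
    "Threshold": min_threshold,
    "Pass/Fail": "Pass" if accuracy > min_threshold else "Fail",
}
passed = accuracy > min_threshold  # the boolean the test now returns alongside the table
print(row, passed)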
|