validmind 2.8.29__py3-none-any.whl → 2.10.0rc1__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- validmind/__init__.py +16 -5
- validmind/__version__.py +1 -1
- validmind/ai/utils.py +4 -24
- validmind/api_client.py +6 -17
- validmind/datasets/credit_risk/lending_club.py +13 -1
- validmind/datasets/nlp/cnn_dailymail.py +15 -1
- validmind/logging.py +48 -0
- validmind/tests/__init__.py +2 -0
- validmind/tests/__types__.py +18 -0
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +14 -2
- validmind/tests/data_validation/DickeyFullerGLS.py +13 -2
- validmind/tests/data_validation/PhillipsPerronArch.py +13 -2
- validmind/tests/data_validation/SeasonalDecompose.py +14 -2
- validmind/tests/data_validation/ShapiroWilk.py +14 -1
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +14 -1
- validmind/tests/data_validation/WOEBinPlots.py +14 -1
- validmind/tests/data_validation/WOEBinTable.py +13 -2
- validmind/tests/data_validation/ZivotAndrewsArch.py +13 -2
- validmind/tests/data_validation/nlp/CommonWords.py +14 -2
- validmind/tests/data_validation/nlp/LanguageDetection.py +14 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +13 -1
- validmind/tests/data_validation/nlp/Sentiment.py +13 -1
- validmind/tests/data_validation/nlp/StopWords.py +14 -2
- validmind/tests/data_validation/nlp/TextDescription.py +14 -2
- validmind/tests/data_validation/nlp/Toxicity.py +13 -1
- validmind/tests/model_validation/BertScore.py +13 -2
- validmind/tests/model_validation/BleuScore.py +13 -2
- validmind/tests/model_validation/ContextualRecall.py +13 -1
- validmind/tests/model_validation/MeteorScore.py +13 -2
- validmind/tests/model_validation/ModelPredictionResiduals.py +14 -1
- validmind/tests/model_validation/RegardScore.py +13 -2
- validmind/tests/model_validation/RougeScore.py +14 -1
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +14 -1
- validmind/tests/model_validation/ToxicityScore.py +13 -1
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +14 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +13 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +14 -2
- validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +14 -1
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +14 -1
- validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +14 -1
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +14 -1
- validmind/tests/output.py +9 -2
- validmind/tests/plots/BoxPlot.py +260 -0
- validmind/tests/plots/CorrelationHeatmap.py +235 -0
- validmind/tests/plots/HistogramPlot.py +233 -0
- validmind/tests/plots/ViolinPlot.py +125 -0
- validmind/tests/plots/__init__.py +0 -0
- validmind/tests/stats/CorrelationAnalysis.py +251 -0
- validmind/tests/stats/DescriptiveStats.py +197 -0
- validmind/tests/stats/NormalityTests.py +147 -0
- validmind/tests/stats/OutlierDetection.py +173 -0
- validmind/tests/stats/__init__.py +0 -0
- validmind/unit_metrics/classification/individual/AbsoluteError.py +42 -0
- validmind/unit_metrics/classification/individual/BrierScore.py +56 -0
- validmind/unit_metrics/classification/individual/CalibrationError.py +77 -0
- validmind/unit_metrics/classification/individual/ClassBalance.py +65 -0
- validmind/unit_metrics/classification/individual/Confidence.py +52 -0
- validmind/unit_metrics/classification/individual/Correctness.py +41 -0
- validmind/unit_metrics/classification/individual/LogLoss.py +61 -0
- validmind/unit_metrics/classification/individual/OutlierScore.py +86 -0
- validmind/unit_metrics/classification/individual/ProbabilityError.py +54 -0
- validmind/unit_metrics/classification/individual/Uncertainty.py +60 -0
- validmind/unit_metrics/classification/individual/__init__.py +0 -0
- validmind/vm_models/dataset/dataset.py +147 -1
- validmind/vm_models/result/result.py +30 -6
- validmind-2.10.0rc1.dist-info/METADATA +845 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/RECORD +70 -49
- validmind-2.8.29.dist-info/METADATA +0 -137
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/LICENSE +0 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/WHEEL +0 -0
- {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/entry_points.txt +0 -0
validmind/unit_metrics/classification/individual/ClassBalance.py
@@ -0,0 +1,65 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def ClassBalance(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+    """Calculates the class balance score per row for a classification model.
+
+    For each prediction, this returns how balanced the predicted class is in the
+    training distribution. Lower scores indicate predictions on rare classes,
+    higher scores indicate predictions on common classes. This helps understand
+    if model errors are more likely on imbalanced classes.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predictions
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row class balance scores as a list of float values
+
+    Note:
+        Scores range from 0 to 0.5, where 0.5 indicates perfectly balanced classes
+        and lower values indicate more imbalanced classes.
+    """
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Convert to numpy arrays
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
+
+    # Calculate class frequencies in the true labels (proxy for training distribution)
+    unique_classes, class_counts = np.unique(y_true, return_counts=True)
+    class_frequencies = class_counts / len(y_true)
+
+    # Create a mapping from class to frequency
+    class_to_freq = dict(zip(unique_classes, class_frequencies))
+
+    # Calculate balance score for each prediction
+    balance_scores = []
+
+    for pred in y_pred:
+        if pred in class_to_freq:
+            freq = class_to_freq[pred]
+            # Balance score: how close to 0.5 (perfectly balanced) the frequency is
+            # Score = 0.5 - |freq - 0.5| = min(freq, 1-freq)
+            balance_score = min(freq, 1 - freq)
+        else:
+            # Predicted class not seen in true labels (very rare)
+            balance_score = 0.0
+
+        balance_scores.append(balance_score)
+
+    # Return as a list of floats
+    return balance_scores
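For reference, a minimal sketch (plain numpy, not part of the package) of the min(freq, 1 - freq) scoring that ClassBalance applies to each prediction:

```python
import numpy as np

y_true = np.array([0, 0, 0, 1, 1, 2])  # class frequencies: 0 -> 0.5, 1 -> 1/3, 2 -> 1/6
y_pred = np.array([0, 2, 1])           # toy predictions, for illustration only

classes, counts = np.unique(y_true, return_counts=True)
class_to_freq = dict(zip(classes, counts / len(y_true)))

# Score = min(freq, 1 - freq): 0.5 means perfectly balanced, small values mean rare classes
scores = [
    min(class_to_freq[p], 1 - class_to_freq[p]) if p in class_to_freq else 0.0
    for p in y_pred
]
print([round(float(s), 3) for s in scores])  # [0.5, 0.167, 0.333]
```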
validmind/unit_metrics/classification/individual/Confidence.py
@@ -0,0 +1,52 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def Confidence(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+    """Calculates the prediction confidence per row for a classification model.
+
+    For binary classification, confidence is calculated as the maximum probability
+    across classes, or alternatively as the distance from the decision boundary (0.5).
+    Higher values indicate more confident predictions.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predicted probabilities
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row confidence scores as a list of float values
+
+    Raises:
+        ValueError: If probability column is not found for the model
+    """
+    # Try to get probabilities, fall back to predictions if not available
+    try:
+        y_prob = dataset.y_prob(model)
+        # For binary classification, use max probability approach
+        if y_prob.ndim > 1 and y_prob.shape[1] > 1:
+            # Multi-class: confidence is the maximum probability
+            confidence = np.max(y_prob, axis=1)
+        else:
+            # Binary classification: confidence based on distance from 0.5
+            y_prob = np.asarray(y_prob, dtype=float)
+            confidence = np.abs(y_prob - 0.5) + 0.5
+    except ValueError:
+        # Fall back to binary correctness if probabilities not available
+        y_true = dataset.y
+        y_pred = dataset.y_pred(model)
+        # If no probabilities, confidence is 1.0 for correct, 0.0 for incorrect
+        confidence = (y_true == y_pred).astype(float)
+
+    # Return as a list of floats
+    return confidence.tolist()
validmind/unit_metrics/classification/individual/Correctness.py
@@ -0,0 +1,41 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def Correctness(model: VMModel, dataset: VMDataset, **kwargs) -> List[int]:
+    """Calculates the correctness per row for a classification model.
+
+    For classification tasks, this returns 1 for correctly classified rows
+    and 0 for incorrectly classified rows. This provides a binary indicator
+    of model performance for each individual prediction.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predictions
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[int]: Per-row correctness as a list of 1s and 0s
+    """
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Convert to numpy arrays
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
+
+    # For classification, check if predictions match true labels
+    correctness = (y_true == y_pred).astype(int)
+
+    # Return as a list of integers
+    return correctness.tolist()
validmind/unit_metrics/classification/individual/LogLoss.py
@@ -0,0 +1,61 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def LogLoss(
+    model: VMModel, dataset: VMDataset, eps: float = 1e-15, **kwargs
+) -> List[float]:
+    """Calculates the logarithmic loss per row for a classification model.
+
+    Log loss measures the performance of a classification model where the prediction
+    is a probability value between 0 and 1. The log loss increases as the predicted
+    probability diverges from the actual label.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predicted probabilities
+        eps: Small value to avoid log(0), defaults to 1e-15
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row log loss values as a list of float values
+
+    Raises:
+        ValueError: If probability column is not found for the model
+    """
+    y_true = dataset.y
+
+    # Try to get probabilities
+    try:
+        y_prob = dataset.y_prob(model)
+        # For binary classification, use the positive class probability
+        if y_prob.ndim > 1 and y_prob.shape[1] > 1:
+            y_prob = y_prob[:, 1]  # Use probability of positive class
+    except ValueError:
+        # Fall back to predictions if probabilities not available
+        # Convert predictions to "probabilities" (0.99 for correct class, 0.01 for wrong)
+        y_pred = dataset.y_pred(model)
+        y_prob = np.where(y_true == y_pred, 0.99, 0.01)
+
+    # Convert to numpy arrays and ensure same data type
+    y_true = np.asarray(y_true, dtype=float)
+    y_prob = np.asarray(y_prob, dtype=float)
+
+    # Clip probabilities to avoid log(0) and log(1)
+    y_prob = np.clip(y_prob, eps, 1 - eps)
+
+    # Calculate log loss per row: -[y*log(p) + (1-y)*log(1-p)]
+    log_loss_per_row = -(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))
+
+    # Return as a list of floats
+    return log_loss_per_row.tolist()
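A small worked example of the per-row log loss formula above, assuming only numpy:

```python
import numpy as np

eps = 1e-15
y_true = np.array([1.0, 0.0, 1.0])
y_prob = np.clip(np.array([0.9, 0.2, 0.5]), eps, 1 - eps)

# -[y*log(p) + (1-y)*log(1-p)] per row
per_row = -(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))
print(per_row.round(3))  # [0.105 0.223 0.693]
```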
validmind/unit_metrics/classification/individual/OutlierScore.py
@@ -0,0 +1,86 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+from sklearn.ensemble import IsolationForest
+from sklearn.preprocessing import StandardScaler
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def OutlierScore(
+    model: VMModel, dataset: VMDataset, contamination: float = 0.1, **kwargs
+) -> List[float]:
+    """Calculates the outlier score per row for a classification model.
+
+    Uses Isolation Forest to identify samples that deviate significantly from
+    the typical patterns in the feature space. Higher scores indicate more
+    anomalous/outlier-like samples. This can help identify out-of-distribution
+    samples or data points that might be harder to predict accurately.
+
+    Args:
+        model: The classification model to evaluate (unused but kept for consistency)
+        dataset: The dataset containing feature data
+        contamination: Expected proportion of outliers, defaults to 0.1
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row outlier scores as a list of float values
+
+    Note:
+        Scores are normalized to [0, 1] where higher values indicate more outlier-like samples
+    """
+    # Get feature data
+    X = dataset.x_df()
+
+    # Handle case where we have no features or only categorical features
+    if X.empty or X.shape[1] == 0:
+        # Return zero outlier scores if no features available
+        return [0.0] * len(dataset.y)
+
+    # Select only numeric features for outlier detection
+    numeric_features = dataset.feature_columns_numeric
+    if not numeric_features:
+        # If no numeric features, return zero outlier scores
+        return [0.0] * len(dataset.y)
+
+    X_numeric = X[numeric_features]
+
+    # Handle missing values by filling with median
+    X_filled = X_numeric.fillna(X_numeric.median())
+
+    # Standardize features for better outlier detection
+    scaler = StandardScaler()
+    X_scaled = scaler.fit_transform(X_filled)
+
+    # Fit Isolation Forest
+    isolation_forest = IsolationForest(
+        contamination=contamination, random_state=42, n_estimators=100
+    )
+
+    # Fit the model on the data
+    isolation_forest.fit(X_scaled)
+
+    # Get anomaly scores (negative values for outliers)
+    anomaly_scores = isolation_forest.decision_function(X_scaled)
+
+    # Convert to outlier scores (0 to 1, where 1 is most outlier-like)
+    # Normalize using min-max scaling
+    min_score = np.min(anomaly_scores)
+    max_score = np.max(anomaly_scores)
+
+    if max_score == min_score:
+        # All samples have same score, no outliers detected
+        outlier_scores = np.zeros_like(anomaly_scores)
+    else:
+        # Invert and normalize: higher values = more outlier-like
+        outlier_scores = (max_score - anomaly_scores) / (max_score - min_score)
+
+    # Return as a list of floats
+    return outlier_scores.tolist()
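A standalone sketch (scikit-learn and numpy only, synthetic data) of the normalization used above: Isolation Forest decision_function output is inverted and min-max scaled so that 1.0 marks the most outlier-like row.

```python
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, size=(50, 2)), [[8.0, 8.0]]])  # last row is an obvious outlier

X_scaled = StandardScaler().fit_transform(X)
forest = IsolationForest(contamination=0.1, random_state=42, n_estimators=100).fit(X_scaled)
raw = forest.decision_function(X_scaled)  # lower (more negative) = more anomalous

scores = (raw.max() - raw) / (raw.max() - raw.min())  # invert + scale to [0, 1]
print(int(scores.argmax()), round(float(scores.max()), 2))  # 50 1.0
```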
validmind/unit_metrics/classification/individual/ProbabilityError.py
@@ -0,0 +1,54 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def ProbabilityError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+    """Calculates the probability error per row for a classification model.
+
+    For binary classification tasks, this computes the absolute difference between
+    the true class labels (0 or 1) and the predicted probabilities for each row.
+    This provides insight into how confident the model's predictions are and
+    how far off they are from the actual labels.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predicted probabilities
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row probability errors as a list of float values
+
+    Raises:
+        ValueError: If probability column is not found for the model
+    """
+    y_true = dataset.y
+
+    # Try to get probabilities, fall back to predictions if not available
+    try:
+        y_prob = dataset.y_prob(model)
+        # For binary classification, use the positive class probability
+        if y_prob.ndim > 1 and y_prob.shape[1] > 1:
+            y_prob = y_prob[:, 1]  # Use probability of positive class
+    except ValueError:
+        # Fall back to predictions if probabilities not available
+        y_prob = dataset.y_pred(model)
+
+    # Convert to numpy arrays and ensure same data type
+    y_true = np.asarray(y_true, dtype=float)
+    y_prob = np.asarray(y_prob, dtype=float)
+
+    # Compute absolute difference between true labels and predicted probabilities
+    probability_errors = np.abs(y_true - y_prob)
+
+    # Return as a list of floats
+    return probability_errors.tolist()
validmind/unit_metrics/classification/individual/Uncertainty.py
@@ -0,0 +1,60 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def Uncertainty(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+    """Calculates the prediction uncertainty per row for a classification model.
+
+    Uncertainty is measured using the entropy of the predicted probability distribution.
+    Higher entropy indicates higher uncertainty in the prediction. For binary
+    classification, maximum uncertainty occurs at probability 0.5.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predicted probabilities
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row uncertainty scores as a list of float values
+
+    Raises:
+        ValueError: If probability column is not found for the model
+    """
+    # Try to get probabilities
+    try:
+        y_prob = dataset.y_prob(model)
+
+        if y_prob.ndim > 1 and y_prob.shape[1] > 1:
+            # Multi-class: calculate entropy across all classes
+            # Clip to avoid log(0)
+            y_prob_clipped = np.clip(y_prob, 1e-15, 1 - 1e-15)
+            # Entropy: -sum(p * log(p))
+            uncertainty = -np.sum(y_prob_clipped * np.log(y_prob_clipped), axis=1)
+        else:
+            # Binary classification: calculate binary entropy
+            y_prob = np.asarray(y_prob, dtype=float)
+            # Clip to avoid log(0)
+            y_prob_clipped = np.clip(y_prob, 1e-15, 1 - 1e-15)
+            # Binary entropy: -[p*log(p) + (1-p)*log(1-p)]
+            uncertainty = -(
+                y_prob_clipped * np.log(y_prob_clipped)
+                + (1 - y_prob_clipped) * np.log(1 - y_prob_clipped)
+            )
+
+    except ValueError:
+        # If no probabilities available, assume zero uncertainty for hard predictions
+        n_samples = len(dataset.y)
+        uncertainty = np.zeros(n_samples)
+
+    # Return as a list of floats
+    return uncertainty.tolist()
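A quick numeric check of the binary entropy used above; maximum uncertainty (ln 2 ≈ 0.693) occurs at probability 0.5:

```python
import numpy as np

p = np.clip(np.array([0.5, 0.9, 0.99]), 1e-15, 1 - 1e-15)
entropy = -(p * np.log(p) + (1 - p) * np.log(1 - p))
print(entropy.round(3))  # [0.693 0.325 0.056]
```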
validmind/unit_metrics/classification/individual/__init__.py: file without changes
validmind/vm_models/dataset/dataset.py
@@ -8,7 +8,7 @@ Dataset class wrapper
 
 import warnings
 from copy import deepcopy
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -458,6 +458,152 @@ class VMDataset(VMInput):
 
         return self.extra_columns.probability_column(model, column_name)
 
+    def assign_scores(
+        self,
+        model: VMModel,
+        metrics: Union[str, List[str]],
+        **kwargs: Dict[str, Any],
+    ) -> None:
+        """Assign computed unit metric scores to the dataset as new columns.
+
+        This method computes unit metrics for the given model and dataset, then adds
+        the computed scores as new columns to the dataset using the naming convention:
+        {model.input_id}_{metric_name}
+
+        Args:
+            model (VMModel): The model used to compute the scores.
+            metrics (Union[str, List[str]]): Single metric ID or list of metric IDs.
+                Can be either:
+                - Short name (e.g., "F1", "Precision")
+                - Full metric ID (e.g., "validmind.unit_metrics.classification.F1")
+            **kwargs: Additional parameters passed to the unit metrics.
+
+        Examples:
+            # Single metric
+            dataset.assign_scores(model, "F1")
+
+            # Multiple metrics
+            dataset.assign_scores(model, ["F1", "Precision", "Recall"])
+
+            # With parameters
+            dataset.assign_scores(model, "ROC_AUC", average="weighted")
+
+        Raises:
+            ValueError: If the model input_id is None or if metric computation fails.
+            ImportError: If unit_metrics module cannot be imported.
+        """
+        if model.input_id is None:
+            raise ValueError("Model input_id must be set to use assign_scores")
+
+        # Import unit_metrics module
+        try:
+            from validmind.unit_metrics import run_metric
+        except ImportError as e:
+            raise ImportError(
+                f"Failed to import unit_metrics module: {e}. "
+                "Make sure validmind.unit_metrics is available."
+            ) from e
+
+        # Normalize metrics to a list
+        if isinstance(metrics, str):
+            metrics = [metrics]
+
+        # Process each metric
+        for metric in metrics:
+            # Normalize metric ID
+            metric_id = self._normalize_metric_id(metric)
+
+            # Extract metric name for column naming
+            metric_name = self._extract_metric_name(metric_id)
+
+            # Generate column name
+            column_name = f"{model.input_id}_{metric_name}"
+
+            try:
+                # Run the unit metric
+                result = run_metric(
+                    metric_id,
+                    inputs={
+                        "model": model,
+                        "dataset": self,
+                    },
+                    params=kwargs,
+                    show=False,  # Don't show widget output
+                )
+
+                # Extract the metric value
+                metric_value = result.metric
+
+                # Create column values (repeat the scalar value for all rows)
+                if np.isscalar(metric_value):
+                    column_values = np.full(len(self._df), metric_value)
+                else:
+                    if len(metric_value) != len(self._df):
+                        raise ValueError(
+                            f"Metric value length {len(metric_value)} does not match dataset length {len(self._df)}"
+                        )
+                    column_values = metric_value
+
+                # Add the column to the dataset
+                self.add_extra_column(column_name, column_values)
+
+                logger.info(f"Added metric column '{column_name}'")
+            except Exception as e:
+                logger.error(f"Failed to compute metric {metric_id}: {e}")
+                raise ValueError(f"Failed to compute metric {metric_id}: {e}") from e
+
+    def _normalize_metric_id(self, metric: str) -> str:
+        """Normalize metric identifier to full validmind unit metric ID.
+
+        Args:
+            metric (str): Metric identifier (short name or full ID)
+
+        Returns:
+            str: Full metric ID
+        """
+        # If already a full ID, return as-is
+        if metric.startswith("validmind.unit_metrics."):
+            return metric
+
+        # Try to find the metric by short name
+        try:
+            from validmind.unit_metrics import list_metrics
+
+            available_metrics = list_metrics()
+
+            # Look for exact match with short name
+            for metric_id in available_metrics:
+                if metric_id.endswith(f".{metric}"):
+                    return metric_id
+
+            # If no exact match found, raise error with suggestions
+            suggestions = [m for m in available_metrics if metric.lower() in m.lower()]
+            if suggestions:
+                raise ValueError(
+                    f"Metric '{metric}' not found. Did you mean one of: {suggestions[:5]}"
+                )
+            else:
+                raise ValueError(
+                    f"Metric '{metric}' not found. Available metrics: {available_metrics[:10]}..."
+                )
+
+        except ImportError as e:
+            raise ImportError(
+                f"Failed to import unit_metrics for metric lookup: {e}"
+            ) from e
+
+    def _extract_metric_name(self, metric_id: str) -> str:
+        """Extract the metric name from a full metric ID.
+
+        Args:
+            metric_id (str): Full metric ID
+
+        Returns:
+            str: Metric name
+        """
+        # Extract the last part after the final dot
+        return metric_id.split(".")[-1]
+
     def add_extra_column(self, column_name, column_values=None):
         """Adds an extra column to the dataset without modifying the dataset `features` and `target` columns.
 
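A hypothetical usage sketch of the new `assign_scores` API, following the docstring above. The model/dataset setup, input IDs, and the offline execution are assumed for illustration and are not part of this diff; the "F1" short name comes from the docstring, and "Correctness"/"ProbabilityError" are the per-row unit metrics added in this release.

```python
import pandas as pd
import validmind as vm
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Assumed toy setup: a small synthetic binary classification problem
X, y = make_classification(n_samples=100, n_features=4, random_state=0)
df = pd.DataFrame(X, columns=["f0", "f1", "f2", "f3"])
df["target"] = y
model = LogisticRegression().fit(X, y)

vm_model = vm.init_model(model, input_id="logreg")
vm_ds = vm.init_dataset(dataset=df, input_id="train_ds", target_column="target")
vm_ds.assign_predictions(vm_model)

# Scalar unit metric -> one value repeated on every row, stored as "logreg_F1"
vm_ds.assign_scores(vm_model, "F1")

# Per-row unit metrics from this release -> one value per row
vm_ds.assign_scores(vm_model, ["Correctness", "ProbabilityError"])

print(vm_ds.df[["logreg_F1", "logreg_Correctness", "logreg_ProbabilityError"]].head())
```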
validmind/vm_models/result/result.py
@@ -7,6 +7,7 @@ Result objects for test results
 """
 import asyncio
 import json
+import os
 from abc import abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union
@@ -20,7 +21,7 @@ from ipywidgets import HTML, VBox
 from ... import api_client
 from ...ai.utils import DescriptionFuture
 from ...errors import InvalidParameterError
-from ...logging import get_logger
+from ...logging import get_logger, log_api_operation
 from ...utils import (
     HumanReadableEncoder,
     NumpyEncoder,
@@ -177,7 +178,7 @@ class TestResult(Result):
     title: Optional[str] = None
     doc: Optional[str] = None
     description: Optional[Union[str, DescriptionFuture]] = None
-    metric: Optional[Union[int, float]] = None
+    metric: Optional[Union[int, float, List[Union[int, float]]]] = None
     tables: Optional[List[ResultTable]] = None
     raw_data: Optional[RawData] = None
     figures: Optional[List[Figure]] = None
@@ -464,8 +465,10 @@
                 )
             )
 
-
-
+        # Only log unit metrics when the metric is a scalar value.
+        # Some tests may assign a list/array of per-row metrics to `self.metric`.
+        # Those should not be sent to the unit-metric endpoint which expects scalars.
+        if self.metric is not None and not hasattr(self.metric, "__len__"):
             tasks.append(
                 api_client.alog_metric(
                     key=self.result_id,
@@ -476,9 +479,30 @@
             )
 
         if self.figures:
-
-
+            batch_size = min(
+                len(self.figures), int(os.getenv("VM_FIGURE_MAX_BATCH_SIZE", 20))
             )
+            figure_batches = [
+                self.figures[i : i + batch_size]
+                for i in range(0, len(self.figures), batch_size)
+            ]
+
+            async def upload_figures_in_batches():
+                for batch in figure_batches:
+
+                    @log_api_operation(
+                        operation_name=f"Uploading batch of {len(batch)} figures"
+                    )
+                    async def process_batch():
+                        batch_tasks = [
+                            api_client.alog_figure(figure) for figure in batch
+                        ]
+                        return await asyncio.gather(*batch_tasks)
+
+                    await process_batch()
+
+            tasks.append(upload_figures_in_batches())
+
         if self.description:
             revision_name = (
                 AI_REVISION_NAME