validmind 2.8.29__py3-none-any.whl → 2.10.0rc1__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (71)
  1. validmind/__init__.py +16 -5
  2. validmind/__version__.py +1 -1
  3. validmind/ai/utils.py +4 -24
  4. validmind/api_client.py +6 -17
  5. validmind/datasets/credit_risk/lending_club.py +13 -1
  6. validmind/datasets/nlp/cnn_dailymail.py +15 -1
  7. validmind/logging.py +48 -0
  8. validmind/tests/__init__.py +2 -0
  9. validmind/tests/__types__.py +18 -0
  10. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +14 -2
  11. validmind/tests/data_validation/DickeyFullerGLS.py +13 -2
  12. validmind/tests/data_validation/PhillipsPerronArch.py +13 -2
  13. validmind/tests/data_validation/SeasonalDecompose.py +14 -2
  14. validmind/tests/data_validation/ShapiroWilk.py +14 -1
  15. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +14 -1
  16. validmind/tests/data_validation/WOEBinPlots.py +14 -1
  17. validmind/tests/data_validation/WOEBinTable.py +13 -2
  18. validmind/tests/data_validation/ZivotAndrewsArch.py +13 -2
  19. validmind/tests/data_validation/nlp/CommonWords.py +14 -2
  20. validmind/tests/data_validation/nlp/LanguageDetection.py +14 -1
  21. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +13 -1
  22. validmind/tests/data_validation/nlp/Sentiment.py +13 -1
  23. validmind/tests/data_validation/nlp/StopWords.py +14 -2
  24. validmind/tests/data_validation/nlp/TextDescription.py +14 -2
  25. validmind/tests/data_validation/nlp/Toxicity.py +13 -1
  26. validmind/tests/model_validation/BertScore.py +13 -2
  27. validmind/tests/model_validation/BleuScore.py +13 -2
  28. validmind/tests/model_validation/ContextualRecall.py +13 -1
  29. validmind/tests/model_validation/MeteorScore.py +13 -2
  30. validmind/tests/model_validation/ModelPredictionResiduals.py +14 -1
  31. validmind/tests/model_validation/RegardScore.py +13 -2
  32. validmind/tests/model_validation/RougeScore.py +14 -1
  33. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +14 -1
  34. validmind/tests/model_validation/ToxicityScore.py +13 -1
  35. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +14 -2
  36. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +13 -2
  37. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +14 -2
  38. validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +14 -1
  39. validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +14 -1
  40. validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +14 -1
  41. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +14 -1
  42. validmind/tests/output.py +9 -2
  43. validmind/tests/plots/BoxPlot.py +260 -0
  44. validmind/tests/plots/CorrelationHeatmap.py +235 -0
  45. validmind/tests/plots/HistogramPlot.py +233 -0
  46. validmind/tests/plots/ViolinPlot.py +125 -0
  47. validmind/tests/plots/__init__.py +0 -0
  48. validmind/tests/stats/CorrelationAnalysis.py +251 -0
  49. validmind/tests/stats/DescriptiveStats.py +197 -0
  50. validmind/tests/stats/NormalityTests.py +147 -0
  51. validmind/tests/stats/OutlierDetection.py +173 -0
  52. validmind/tests/stats/__init__.py +0 -0
  53. validmind/unit_metrics/classification/individual/AbsoluteError.py +42 -0
  54. validmind/unit_metrics/classification/individual/BrierScore.py +56 -0
  55. validmind/unit_metrics/classification/individual/CalibrationError.py +77 -0
  56. validmind/unit_metrics/classification/individual/ClassBalance.py +65 -0
  57. validmind/unit_metrics/classification/individual/Confidence.py +52 -0
  58. validmind/unit_metrics/classification/individual/Correctness.py +41 -0
  59. validmind/unit_metrics/classification/individual/LogLoss.py +61 -0
  60. validmind/unit_metrics/classification/individual/OutlierScore.py +86 -0
  61. validmind/unit_metrics/classification/individual/ProbabilityError.py +54 -0
  62. validmind/unit_metrics/classification/individual/Uncertainty.py +60 -0
  63. validmind/unit_metrics/classification/individual/__init__.py +0 -0
  64. validmind/vm_models/dataset/dataset.py +147 -1
  65. validmind/vm_models/result/result.py +30 -6
  66. validmind-2.10.0rc1.dist-info/METADATA +845 -0
  67. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/RECORD +70 -49
  68. validmind-2.8.29.dist-info/METADATA +0 -137
  69. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/LICENSE +0 -0
  70. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/WHEEL +0 -0
  71. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/entry_points.txt +0 -0
validmind/unit_metrics/classification/individual/ClassBalance.py
@@ -0,0 +1,65 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def ClassBalance(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+    """Calculates the class balance score per row for a classification model.
+
+    For each prediction, this returns how balanced the predicted class is in the
+    training distribution. Lower scores indicate predictions on rare classes,
+    higher scores indicate predictions on common classes. This helps understand
+    if model errors are more likely on imbalanced classes.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predictions
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row class balance scores as a list of float values
+
+    Note:
+        Scores range from 0 to 0.5, where 0.5 indicates perfectly balanced classes
+        and lower values indicate more imbalanced classes.
+    """
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Convert to numpy arrays
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
+
+    # Calculate class frequencies in the true labels (proxy for training distribution)
+    unique_classes, class_counts = np.unique(y_true, return_counts=True)
+    class_frequencies = class_counts / len(y_true)
+
+    # Create a mapping from class to frequency
+    class_to_freq = dict(zip(unique_classes, class_frequencies))
+
+    # Calculate balance score for each prediction
+    balance_scores = []
+
+    for pred in y_pred:
+        if pred in class_to_freq:
+            freq = class_to_freq[pred]
+            # Balance score: how close to 0.5 (perfectly balanced) the frequency is
+            # Score = 0.5 - |freq - 0.5| = min(freq, 1-freq)
+            balance_score = min(freq, 1 - freq)
+        else:
+            # Predicted class not seen in true labels (very rare)
+            balance_score = 0.0
+
+        balance_scores.append(balance_score)
+
+    # Return as a list of floats
+    return balance_scores
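
A quick, hypothetical illustration of the mapping used above (not part of the package): for a predicted class with frequency `freq` among the true labels, the per-row score is `min(freq, 1 - freq)`, so a 90/10 split yields 0.1 for rows predicted as either class, while a 50/50 split would yield 0.5.

# Toy data, for illustration only; the real metric reads y_true/y_pred from a VMDataset.
import numpy as np

y_true = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])  # 90/10 class split
y_pred = np.array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1])

classes, counts = np.unique(y_true, return_counts=True)
freq = dict(zip(classes, counts / len(y_true)))      # {0: 0.9, 1: 0.1}

scores = [min(freq[p], 1 - freq[p]) if p in freq else 0.0 for p in y_pred]
print(scores)  # 0.1 for every row: a 90/10 split is far from a balanced 50/50
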
validmind/unit_metrics/classification/individual/Confidence.py
@@ -0,0 +1,52 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def Confidence(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+    """Calculates the prediction confidence per row for a classification model.
+
+    For binary classification, confidence is calculated as the maximum probability
+    across classes, or alternatively as the distance from the decision boundary (0.5).
+    Higher values indicate more confident predictions.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predicted probabilities
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row confidence scores as a list of float values
+
+    Raises:
+        ValueError: If probability column is not found for the model
+    """
+    # Try to get probabilities, fall back to predictions if not available
+    try:
+        y_prob = dataset.y_prob(model)
+        # For binary classification, use max probability approach
+        if y_prob.ndim > 1 and y_prob.shape[1] > 1:
+            # Multi-class: confidence is the maximum probability
+            confidence = np.max(y_prob, axis=1)
+        else:
+            # Binary classification: confidence based on distance from 0.5
+            y_prob = np.asarray(y_prob, dtype=float)
+            confidence = np.abs(y_prob - 0.5) + 0.5
+    except ValueError:
+        # Fall back to binary correctness if probabilities not available
+        y_true = dataset.y
+        y_pred = dataset.y_pred(model)
+        # If no probabilities, confidence is 1.0 for correct, 0.0 for incorrect
+        confidence = (y_true == y_pred).astype(float)
+
+    # Return as a list of floats
+    return confidence.tolist()
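
A toy check of the binary branch above (made-up probabilities, not package code): probabilities near 0 or 1 map to confidence near 1, while 0.5 maps to the minimum confidence of 0.5.

# Made-up probabilities illustrating the |p - 0.5| + 0.5 mapping.
import numpy as np

p = np.array([0.02, 0.35, 0.5, 0.8, 0.99])
confidence = np.abs(p - 0.5) + 0.5
print(confidence)  # [0.98 0.65 0.5 0.8 0.99]
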
validmind/unit_metrics/classification/individual/Correctness.py
@@ -0,0 +1,41 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def Correctness(model: VMModel, dataset: VMDataset, **kwargs) -> List[int]:
+    """Calculates the correctness per row for a classification model.
+
+    For classification tasks, this returns 1 for correctly classified rows
+    and 0 for incorrectly classified rows. This provides a binary indicator
+    of model performance for each individual prediction.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predictions
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[int]: Per-row correctness as a list of 1s and 0s
+    """
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Convert to numpy arrays
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
+
+    # For classification, check if predictions match true labels
+    correctness = (y_true == y_pred).astype(int)
+
+    # Return as a list of integers
+    return correctness.tolist()
validmind/unit_metrics/classification/individual/LogLoss.py
@@ -0,0 +1,61 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def LogLoss(
+    model: VMModel, dataset: VMDataset, eps: float = 1e-15, **kwargs
+) -> List[float]:
+    """Calculates the logarithmic loss per row for a classification model.
+
+    Log loss measures the performance of a classification model where the prediction
+    is a probability value between 0 and 1. The log loss increases as the predicted
+    probability diverges from the actual label.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predicted probabilities
+        eps: Small value to avoid log(0), defaults to 1e-15
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row log loss values as a list of float values
+
+    Raises:
+        ValueError: If probability column is not found for the model
+    """
+    y_true = dataset.y
+
+    # Try to get probabilities
+    try:
+        y_prob = dataset.y_prob(model)
+        # For binary classification, use the positive class probability
+        if y_prob.ndim > 1 and y_prob.shape[1] > 1:
+            y_prob = y_prob[:, 1]  # Use probability of positive class
+    except ValueError:
+        # Fall back to predictions if probabilities not available
+        # Convert predictions to "probabilities" (0.99 for correct class, 0.01 for wrong)
+        y_pred = dataset.y_pred(model)
+        y_prob = np.where(y_true == y_pred, 0.99, 0.01)
+
+    # Convert to numpy arrays and ensure same data type
+    y_true = np.asarray(y_true, dtype=float)
+    y_prob = np.asarray(y_prob, dtype=float)
+
+    # Clip probabilities to avoid log(0) and log(1)
+    y_prob = np.clip(y_prob, eps, 1 - eps)
+
+    # Calculate log loss per row: -[y*log(p) + (1-y)*log(1-p)]
+    log_loss_per_row = -(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))
+
+    # Return as a list of floats
+    return log_loss_per_row.tolist()
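
A sanity check of the per-row formula `-[y*log(p) + (1-y)*log(1-p)]` on made-up labels and probabilities: a confident correct prediction contributes little loss, while a confident wrong one dominates.

# Made-up labels/probabilities; the real metric reads them from a VMDataset.
import numpy as np

y_true = np.array([1.0, 0.0, 1.0])
y_prob = np.array([0.9, 0.2, 0.05])  # last row is confidently wrong

eps = 1e-15
y_prob = np.clip(y_prob, eps, 1 - eps)
per_row = -(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))
print(per_row.round(3))  # [0.105 0.223 2.996]
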
validmind/unit_metrics/classification/individual/OutlierScore.py
@@ -0,0 +1,86 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+from sklearn.ensemble import IsolationForest
+from sklearn.preprocessing import StandardScaler
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def OutlierScore(
+    model: VMModel, dataset: VMDataset, contamination: float = 0.1, **kwargs
+) -> List[float]:
+    """Calculates the outlier score per row for a classification model.
+
+    Uses Isolation Forest to identify samples that deviate significantly from
+    the typical patterns in the feature space. Higher scores indicate more
+    anomalous/outlier-like samples. This can help identify out-of-distribution
+    samples or data points that might be harder to predict accurately.
+
+    Args:
+        model: The classification model to evaluate (unused but kept for consistency)
+        dataset: The dataset containing feature data
+        contamination: Expected proportion of outliers, defaults to 0.1
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row outlier scores as a list of float values
+
+    Note:
+        Scores are normalized to [0, 1] where higher values indicate more outlier-like samples
+    """
+    # Get feature data
+    X = dataset.x_df()
+
+    # Handle case where we have no features or only categorical features
+    if X.empty or X.shape[1] == 0:
+        # Return zero outlier scores if no features available
+        return [0.0] * len(dataset.y)
+
+    # Select only numeric features for outlier detection
+    numeric_features = dataset.feature_columns_numeric
+    if not numeric_features:
+        # If no numeric features, return zero outlier scores
+        return [0.0] * len(dataset.y)
+
+    X_numeric = X[numeric_features]
+
+    # Handle missing values by filling with median
+    X_filled = X_numeric.fillna(X_numeric.median())
+
+    # Standardize features for better outlier detection
+    scaler = StandardScaler()
+    X_scaled = scaler.fit_transform(X_filled)
+
+    # Fit Isolation Forest
+    isolation_forest = IsolationForest(
+        contamination=contamination, random_state=42, n_estimators=100
+    )
+
+    # Fit the model on the data
+    isolation_forest.fit(X_scaled)
+
+    # Get anomaly scores (negative values for outliers)
+    anomaly_scores = isolation_forest.decision_function(X_scaled)
+
+    # Convert to outlier scores (0 to 1, where 1 is most outlier-like)
+    # Normalize using min-max scaling
+    min_score = np.min(anomaly_scores)
+    max_score = np.max(anomaly_scores)
+
+    if max_score == min_score:
+        # All samples have same score, no outliers detected
+        outlier_scores = np.zeros_like(anomaly_scores)
+    else:
+        # Invert and normalize: higher values = more outlier-like
+        outlier_scores = (max_score - anomaly_scores) / (max_score - min_score)
+
+    # Return as a list of floats
+    return outlier_scores.tolist()
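
A standalone sketch of the same score inversion on synthetic data (assumptions: synthetic features, scaling step omitted): `decision_function` returns larger values for inliers, so the min-max inversion maps the most anomalous row to 1.0 and the most typical row to 0.0.

# Synthetic-data sketch of the IsolationForest score inversion used above.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, size=(99, 2)), [[8.0, 8.0]]])  # one obvious outlier

raw = IsolationForest(contamination=0.1, random_state=42).fit(X).decision_function(X)
outlier_scores = (raw.max() - raw) / (raw.max() - raw.min())  # invert: 1 = most outlier-like
print(outlier_scores[-1])  # close to 1.0 for the injected outlier row
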
validmind/unit_metrics/classification/individual/ProbabilityError.py
@@ -0,0 +1,54 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def ProbabilityError(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+    """Calculates the probability error per row for a classification model.
+
+    For binary classification tasks, this computes the absolute difference between
+    the true class labels (0 or 1) and the predicted probabilities for each row.
+    This provides insight into how confident the model's predictions are and
+    how far off they are from the actual labels.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predicted probabilities
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row probability errors as a list of float values
+
+    Raises:
+        ValueError: If probability column is not found for the model
+    """
+    y_true = dataset.y
+
+    # Try to get probabilities, fall back to predictions if not available
+    try:
+        y_prob = dataset.y_prob(model)
+        # For binary classification, use the positive class probability
+        if y_prob.ndim > 1 and y_prob.shape[1] > 1:
+            y_prob = y_prob[:, 1]  # Use probability of positive class
+    except ValueError:
+        # Fall back to predictions if probabilities not available
+        y_prob = dataset.y_pred(model)
+
+    # Convert to numpy arrays and ensure same data type
+    y_true = np.asarray(y_true, dtype=float)
+    y_prob = np.asarray(y_prob, dtype=float)
+
+    # Compute absolute difference between true labels and predicted probabilities
+    probability_errors = np.abs(y_true - y_prob)
+
+    # Return as a list of floats
+    return probability_errors.tolist()
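
A small made-up example: with probabilities available the per-row error is `|y - p|`; in the fallback path, where hard 0/1 predictions stand in for probabilities, the value collapses to 0 for correct rows and 1 for incorrect rows (the complement of `Correctness`).

# Made-up values illustrating both the probability path and the fallback path.
import numpy as np

y_true = np.array([1.0, 0.0, 1.0])
y_prob = np.array([0.8, 0.3, 0.1])
print(np.abs(y_true - y_prob))      # [0.2 0.3 0.9]

y_pred = np.array([1.0, 0.0, 0.0])  # fallback: hard predictions used as "probabilities"
print(np.abs(y_true - y_pred))      # [0. 0. 1.]
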
validmind/unit_metrics/classification/individual/Uncertainty.py
@@ -0,0 +1,60 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import List
+
+import numpy as np
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tasks("classification")
+@tags("classification")
+def Uncertainty(model: VMModel, dataset: VMDataset, **kwargs) -> List[float]:
+    """Calculates the prediction uncertainty per row for a classification model.
+
+    Uncertainty is measured using the entropy of the predicted probability distribution.
+    Higher entropy indicates higher uncertainty in the prediction. For binary
+    classification, maximum uncertainty occurs at probability 0.5.
+
+    Args:
+        model: The classification model to evaluate
+        dataset: The dataset containing true labels and predicted probabilities
+        **kwargs: Additional parameters (unused for compatibility)
+
+    Returns:
+        List[float]: Per-row uncertainty scores as a list of float values
+
+    Raises:
+        ValueError: If probability column is not found for the model
+    """
+    # Try to get probabilities
+    try:
+        y_prob = dataset.y_prob(model)
+
+        if y_prob.ndim > 1 and y_prob.shape[1] > 1:
+            # Multi-class: calculate entropy across all classes
+            # Clip to avoid log(0)
+            y_prob_clipped = np.clip(y_prob, 1e-15, 1 - 1e-15)
+            # Entropy: -sum(p * log(p))
+            uncertainty = -np.sum(y_prob_clipped * np.log(y_prob_clipped), axis=1)
+        else:
+            # Binary classification: calculate binary entropy
+            y_prob = np.asarray(y_prob, dtype=float)
+            # Clip to avoid log(0)
+            y_prob_clipped = np.clip(y_prob, 1e-15, 1 - 1e-15)
+            # Binary entropy: -[p*log(p) + (1-p)*log(1-p)]
+            uncertainty = -(
+                y_prob_clipped * np.log(y_prob_clipped)
+                + (1 - y_prob_clipped) * np.log(1 - y_prob_clipped)
+            )
+
+    except ValueError:
+        # If no probabilities available, assume zero uncertainty for hard predictions
+        n_samples = len(dataset.y)
+        uncertainty = np.zeros(n_samples)
+
+    # Return as a list of floats
+    return uncertainty.tolist()
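
A toy check of the binary-entropy branch: p = 0.5 gives the maximum uncertainty of ln 2 ≈ 0.693 nats, while near-certain probabilities give values close to 0.

# Made-up probabilities illustrating the binary entropy used above.
import numpy as np

p = np.clip(np.array([0.5, 0.9, 0.99]), 1e-15, 1 - 1e-15)
entropy = -(p * np.log(p) + (1 - p) * np.log(1 - p))
print(entropy.round(3))  # [0.693 0.325 0.056]
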
validmind/vm_models/dataset/dataset.py
@@ -8,7 +8,7 @@ Dataset class wrapper
 
 import warnings
 from copy import deepcopy
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -458,6 +458,152 @@ class VMDataset(VMInput):
 
         return self.extra_columns.probability_column(model, column_name)
 
+    def assign_scores(
+        self,
+        model: VMModel,
+        metrics: Union[str, List[str]],
+        **kwargs: Dict[str, Any],
+    ) -> None:
+        """Assign computed unit metric scores to the dataset as new columns.
+
+        This method computes unit metrics for the given model and dataset, then adds
+        the computed scores as new columns to the dataset using the naming convention:
+        {model.input_id}_{metric_name}
+
+        Args:
+            model (VMModel): The model used to compute the scores.
+            metrics (Union[str, List[str]]): Single metric ID or list of metric IDs.
+                Can be either:
+                - Short name (e.g., "F1", "Precision")
+                - Full metric ID (e.g., "validmind.unit_metrics.classification.F1")
+            **kwargs: Additional parameters passed to the unit metrics.
+
+        Examples:
+            # Single metric
+            dataset.assign_scores(model, "F1")
+
+            # Multiple metrics
+            dataset.assign_scores(model, ["F1", "Precision", "Recall"])
+
+            # With parameters
+            dataset.assign_scores(model, "ROC_AUC", average="weighted")
+
+        Raises:
+            ValueError: If the model input_id is None or if metric computation fails.
+            ImportError: If unit_metrics module cannot be imported.
+        """
+        if model.input_id is None:
+            raise ValueError("Model input_id must be set to use assign_scores")
+
+        # Import unit_metrics module
+        try:
+            from validmind.unit_metrics import run_metric
+        except ImportError as e:
+            raise ImportError(
+                f"Failed to import unit_metrics module: {e}. "
+                "Make sure validmind.unit_metrics is available."
+            ) from e
+
+        # Normalize metrics to a list
+        if isinstance(metrics, str):
+            metrics = [metrics]
+
+        # Process each metric
+        for metric in metrics:
+            # Normalize metric ID
+            metric_id = self._normalize_metric_id(metric)
+
+            # Extract metric name for column naming
+            metric_name = self._extract_metric_name(metric_id)
+
+            # Generate column name
+            column_name = f"{model.input_id}_{metric_name}"
+
+            try:
+                # Run the unit metric
+                result = run_metric(
+                    metric_id,
+                    inputs={
+                        "model": model,
+                        "dataset": self,
+                    },
+                    params=kwargs,
+                    show=False,  # Don't show widget output
+                )
+
+                # Extract the metric value
+                metric_value = result.metric
+
+                # Create column values (repeat the scalar value for all rows)
+                if np.isscalar(metric_value):
+                    column_values = np.full(len(self._df), metric_value)
+                else:
+                    if len(metric_value) != len(self._df):
+                        raise ValueError(
+                            f"Metric value length {len(metric_value)} does not match dataset length {len(self._df)}"
+                        )
+                    column_values = metric_value
+
+                # Add the column to the dataset
+                self.add_extra_column(column_name, column_values)
+
+                logger.info(f"Added metric column '{column_name}'")
+            except Exception as e:
+                logger.error(f"Failed to compute metric {metric_id}: {e}")
+                raise ValueError(f"Failed to compute metric {metric_id}: {e}") from e
+
+    def _normalize_metric_id(self, metric: str) -> str:
+        """Normalize metric identifier to full validmind unit metric ID.
+
+        Args:
+            metric (str): Metric identifier (short name or full ID)
+
+        Returns:
+            str: Full metric ID
+        """
+        # If already a full ID, return as-is
+        if metric.startswith("validmind.unit_metrics."):
+            return metric
+
+        # Try to find the metric by short name
+        try:
+            from validmind.unit_metrics import list_metrics
+
+            available_metrics = list_metrics()
+
+            # Look for exact match with short name
+            for metric_id in available_metrics:
+                if metric_id.endswith(f".{metric}"):
+                    return metric_id
+
+            # If no exact match found, raise error with suggestions
+            suggestions = [m for m in available_metrics if metric.lower() in m.lower()]
+            if suggestions:
+                raise ValueError(
+                    f"Metric '{metric}' not found. Did you mean one of: {suggestions[:5]}"
+                )
+            else:
+                raise ValueError(
+                    f"Metric '{metric}' not found. Available metrics: {available_metrics[:10]}..."
+                )
+
+        except ImportError as e:
+            raise ImportError(
+                f"Failed to import unit_metrics for metric lookup: {e}"
+            ) from e
+
+    def _extract_metric_name(self, metric_id: str) -> str:
+        """Extract the metric name from a full metric ID.
+
+        Args:
+            metric_id (str): Full metric ID
+
+        Returns:
+            str: Metric name
+        """
+        # Extract the last part after the final dot
+        return metric_id.split(".")[-1]
+
     def add_extra_column(self, column_name, column_values=None):
         """Adds an extra column to the dataset without modifying the dataset `features` and `target` columns.
 
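
Taken together with the per-row unit metrics above, `assign_scores` can attach row-level diagnostics as extra dataset columns. A hedged usage sketch, assuming `vm_model` and `vm_test_ds` are ValidMind model/dataset objects with `input_id` set (placeholder names, and assuming the listed short names resolve via `list_metrics`):

# Usage sketch only; vm_model / vm_test_ds are placeholders for objects created
# with validmind's init_model / init_dataset helpers.

# Scalar metric: every row of the new "<input_id>_F1" column holds the same value.
vm_test_ds.assign_scores(vm_model, "F1")

# Per-row metrics from validmind.unit_metrics.classification.individual:
# each row gets its own value, e.g. in "<input_id>_LogLoss".
vm_test_ds.assign_scores(vm_model, ["LogLoss", "Uncertainty", "OutlierScore"])

# Extra keyword arguments are forwarded to the underlying unit metric.
vm_test_ds.assign_scores(vm_model, "OutlierScore", contamination=0.05)
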
validmind/vm_models/result/result.py
@@ -7,6 +7,7 @@ Result objects for test results
 """
 import asyncio
 import json
+import os
 from abc import abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union
@@ -20,7 +21,7 @@ from ipywidgets import HTML, VBox
 from ... import api_client
 from ...ai.utils import DescriptionFuture
 from ...errors import InvalidParameterError
-from ...logging import get_logger
+from ...logging import get_logger, log_api_operation
 from ...utils import (
     HumanReadableEncoder,
     NumpyEncoder,
@@ -177,7 +178,7 @@ class TestResult(Result):
     title: Optional[str] = None
     doc: Optional[str] = None
     description: Optional[Union[str, DescriptionFuture]] = None
-    metric: Optional[Union[int, float]] = None
+    metric: Optional[Union[int, float, List[Union[int, float]]]] = None
     tables: Optional[List[ResultTable]] = None
     raw_data: Optional[RawData] = None
     figures: Optional[List[Figure]] = None
@@ -464,8 +465,10 @@
                 )
             )
 
-        if self.metric is not None:
-            # metrics are logged as separate entities
+        # Only log unit metrics when the metric is a scalar value.
+        # Some tests may assign a list/array of per-row metrics to `self.metric`.
+        # Those should not be sent to the unit-metric endpoint which expects scalars.
+        if self.metric is not None and not hasattr(self.metric, "__len__"):
             tasks.append(
                 api_client.alog_metric(
                     key=self.result_id,
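
The `hasattr(self.metric, "__len__")` guard is what keeps the new per-row metric lists out of the scalar unit-metric endpoint; a quick illustration of the distinction it relies on:

# Quick illustration of the scalar-vs-sequence check used above.
import numpy as np

print(hasattr(0.93, "__len__"))                  # False -> scalar, sent to alog_metric
print(hasattr([0.1, 0.9], "__len__"))            # True  -> per-row list, skipped
print(hasattr(np.array([0.1, 0.9]), "__len__"))  # True  -> per-row array, skipped
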
@@ -476,9 +479,30 @@
             )
 
         if self.figures:
-            tasks.extend(
-                [api_client.alog_figure(figure) for figure in (self.figures or [])]
+            batch_size = min(
+                len(self.figures), int(os.getenv("VM_FIGURE_MAX_BATCH_SIZE", 20))
             )
+            figure_batches = [
+                self.figures[i : i + batch_size]
+                for i in range(0, len(self.figures), batch_size)
+            ]
+
+            async def upload_figures_in_batches():
+                for batch in figure_batches:
+
+                    @log_api_operation(
+                        operation_name=f"Uploading batch of {len(batch)} figures"
+                    )
+                    async def process_batch():
+                        batch_tasks = [
+                            api_client.alog_figure(figure) for figure in batch
+                        ]
+                        return await asyncio.gather(*batch_tasks)
+
+                    await process_batch()
+
+            tasks.append(upload_figures_in_batches())
+
         if self.description:
             revision_name = (
                 AI_REVISION_NAME
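
The batching logic can be read in isolation: with the default `VM_FIGURE_MAX_BATCH_SIZE` of 20, a result holding 45 figures is uploaded in three batches of 20, 20, and 5. A standalone sketch of that slicing (placeholder figure list, no API calls):

# Standalone sketch of the figure-batching slice above; `figures` is a placeholder.
import os

figures = [f"figure_{i}" for i in range(45)]
batch_size = min(len(figures), int(os.getenv("VM_FIGURE_MAX_BATCH_SIZE", 20)))

batches = [figures[i : i + batch_size] for i in range(0, len(figures), batch_size)]
print([len(b) for b in batches])  # [20, 20, 5] with the default batch size
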