validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +26 -7
  3. validmind/api_client.py +89 -43
  4. validmind/client.py +2 -2
  5. validmind/client_config.py +11 -14
  6. validmind/datasets/regression/fred_timeseries.py +67 -138
  7. validmind/template.py +1 -0
  8. validmind/test_suites/__init__.py +0 -2
  9. validmind/test_suites/statsmodels_timeseries.py +1 -1
  10. validmind/test_suites/summarization.py +0 -1
  11. validmind/test_suites/time_series.py +0 -43
  12. validmind/tests/__types__.py +3 -13
  13. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  14. validmind/tests/data_validation/ADF.py +31 -24
  15. validmind/tests/data_validation/AutoAR.py +9 -9
  16. validmind/tests/data_validation/AutoMA.py +23 -16
  17. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  18. validmind/tests/data_validation/AutoStationarity.py +21 -16
  19. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  20. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
  21. validmind/tests/data_validation/ClassImbalance.py +15 -12
  22. validmind/tests/data_validation/DFGLSArch.py +19 -13
  23. validmind/tests/data_validation/DatasetDescription.py +17 -11
  24. validmind/tests/data_validation/DatasetSplit.py +7 -5
  25. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  26. validmind/tests/data_validation/Duplicates.py +33 -25
  27. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  28. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  29. validmind/tests/data_validation/HighCardinality.py +19 -12
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  32. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  33. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  34. validmind/tests/data_validation/KPSS.py +34 -29
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  36. validmind/tests/data_validation/MissingValues.py +32 -27
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  38. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  39. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  40. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  41. validmind/tests/data_validation/ScatterPlot.py +63 -78
  42. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  43. validmind/tests/data_validation/Skewness.py +35 -37
  44. validmind/tests/data_validation/SpreadPlot.py +35 -35
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  47. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  48. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  49. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  50. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  51. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  52. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  53. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  54. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  55. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  57. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  58. validmind/tests/data_validation/UniqueRows.py +11 -6
  59. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  60. validmind/tests/data_validation/WOEBinTable.py +35 -30
  61. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  62. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  63. validmind/tests/data_validation/nlp/Hashtags.py +27 -20
  64. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  65. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  66. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  67. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  68. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  69. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  70. validmind/tests/data_validation/nlp/TextDescription.py +36 -35
  71. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  72. validmind/tests/decorator.py +81 -42
  73. validmind/tests/model_validation/BertScore.py +36 -27
  74. validmind/tests/model_validation/BleuScore.py +25 -19
  75. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  76. validmind/tests/model_validation/ContextualRecall.py +35 -13
  77. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  78. validmind/tests/model_validation/MeteorScore.py +46 -33
  79. validmind/tests/model_validation/ModelMetadata.py +32 -64
  80. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  81. validmind/tests/model_validation/RegardScore.py +30 -14
  82. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  83. validmind/tests/model_validation/RougeScore.py +36 -30
  84. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  85. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  86. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  87. validmind/tests/model_validation/TokenDisparity.py +31 -23
  88. validmind/tests/model_validation/ToxicityScore.py +26 -17
  89. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  90. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  91. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  92. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  93. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  94. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  95. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  96. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  97. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  98. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  99. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  100. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  101. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  102. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  103. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  104. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  105. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  106. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  107. validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
  108. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  109. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  110. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  111. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  112. validmind/tests/model_validation/ragas/utils.py +6 -0
  113. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  114. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  115. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  116. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  117. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  118. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  119. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  120. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  121. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  122. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  123. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  124. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  125. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  126. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  127. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  128. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  129. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  130. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  131. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
  132. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  133. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  134. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  135. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  136. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  137. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  138. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
  139. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  140. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
  141. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
  142. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  143. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  144. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  145. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  146. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  147. validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
  148. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  149. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
  150. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  151. validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
  152. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  153. validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
  154. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  155. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
  156. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  157. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  158. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  159. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  160. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  161. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  162. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  163. validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
  164. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  165. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
  166. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  167. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  168. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  169. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  170. validmind/tests/prompt_validation/Bias.py +14 -11
  171. validmind/tests/prompt_validation/Clarity.py +16 -14
  172. validmind/tests/prompt_validation/Conciseness.py +7 -5
  173. validmind/tests/prompt_validation/Delimitation.py +23 -22
  174. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  175. validmind/tests/prompt_validation/Robustness.py +12 -10
  176. validmind/tests/prompt_validation/Specificity.py +13 -11
  177. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  178. validmind/tests/run.py +68 -23
  179. validmind/unit_metrics/__init__.py +81 -144
  180. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  181. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  182. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  183. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  184. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  185. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  186. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  187. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  188. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  189. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  190. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  191. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  192. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  193. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  194. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  195. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  196. validmind/vm_models/dataset/dataset.py +2 -0
  197. validmind/vm_models/figure.py +5 -0
  198. validmind/vm_models/test/result_wrapper.py +93 -132
  199. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
  200. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
  201. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  202. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  203. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  204. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  205. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  206. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  207. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  208. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  209. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  210. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
  211. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
  212. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
@@ -15,29 +15,36 @@ class AdjustedMutualInformation(ClusterPerformance):
  Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting
  for chance.
 
- **1. Purpose**: The purpose of this metric (Adjusted Mutual Information) is to evaluate the performance of a
- machine learning model, more specifically, a clustering model. It measures the mutual information between the true
- labels and the ones predicted by the model, adjusting for chance.
+ ### Purpose
 
- **2. Test Mechanism**: The Adjusted Mutual Information (AMI) uses sklearn's `adjusted_mutual_info_score` function.
- This function calculates the mutual information between the true labels and the ones predicted while correcting for
- the chance correlation expected due to random label assignments. This test requires the model, the training
- dataset, and the test dataset as inputs.
+ The purpose of this metric (Adjusted Mutual Information) is to evaluate the performance of a machine learning
+ model, more specifically, a clustering model. It measures the mutual information between the true labels and the
+ ones predicted by the model, adjusting for chance.
+
+ ### Test Mechanism
+
+ The Adjusted Mutual Information (AMI) uses sklearn's `adjusted_mutual_info_score` function. This function
+ calculates the mutual information between the true labels and the ones predicted while correcting for the chance
+ correlation expected due to random label assignments. This test requires the model, the training dataset, and the
+ test dataset as inputs.
+
+ ### Signs of High Risk
 
- **3. Signs of High Risk**:
  - Low Adjusted Mutual Information Score: This score ranges between 0 and 1. A low score (closer to 0) can indicate
  poor model performance as the predicted labels do not align well with the true labels.
- - In case of high dimensional data, if the algorithm shows high scores, this could also be a potential risk as AMI
+ - In case of high-dimensional data, if the algorithm shows high scores, this could also be a potential risk as AMI
  may not perform reliably.
 
- **4. Strengths**:
+ ### Strengths
+
  - The AMI metric takes into account the randomness of the predicted labels, which makes it more robust than the
  simple Mutual Information.
  - The scale of AMI is not dependent on the sizes of the clustering, allowing for comparability between different
  datasets or models.
  - Good for comparing the output of clustering algorithms where the number of clusters is not known a priori.
 
- **5. Limitations**:
+ ### Limitations
+
  - Adjusted Mutual Information does not take into account the continuous nature of some data. As a result, it may
  not be the best choice for regression or other continuous types of tasks.
  - AMI has the drawback of being biased towards clusterings with a higher number of clusters.
@@ -47,7 +54,7 @@ class AdjustedMutualInformation(ClusterPerformance):
  """
 
  name = "adjusted_mutual_information"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",
@@ -15,38 +15,43 @@ class AdjustedRandIndex(ClusterPerformance):
  Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine
  learning models.
 
- **1. Purpose:**
+ ### Purpose
+
  The Adjusted Rand Index (ARI) metric is intended to measure the similarity between two data clusters. This metric
- is specifically being used for clustering machine learning models to validly quantify how well the model is
- clustering and producing data groups. It involves comparing the model's produced clusters against the actual (true)
- clusters found in the dataset.
+ is specifically used for clustering machine learning models to quantify how well the model is clustering and
+ producing data groups. It involves comparing the model's produced clusters against the actual (true) clusters found
+ in the dataset.
+
+ ### Test Mechanism
+
+ The Adjusted Rand Index (ARI) is calculated using the `adjusted_rand_score` method from the `sklearn.metrics`
+ module in Python. The test requires inputs including the model itself and the model's training and test datasets.
+ The model's computed clusters and the true clusters are compared, and the similarities are measured to compute the
+ ARI.
 
- **2. Test Mechanism:**
- The Adjusted Rand Index (ARI) is calculated by using the `adjusted_rand_score` method from the sklearn metrics in
- Python. The test requires inputs including the model itself and the model's training and test datasets. The model's
- computed clusters and the true clusters are compared, and the similarities are measured to compute the ARI.
+ ### Signs of High Risk
 
- **3. Signs of High Risk:**
- - If the ARI is close to zero, it signifies that the model's cluster assignments are random and don't match the
+ - If the ARI is close to zero, it signifies that the model's cluster assignments are random and do not match the
  actual dataset clusters, indicating a high risk.
  - An ARI of less than zero indicates that the model's clustering performance is worse than random.
 
- **4. Strengths:**
- - ARI is normalized and it hence gives a consistent metric between -1 and +1, irrespective of raw cluster sizes or
+ ### Strengths
+
+ - ARI is normalized and provides a consistent metric between -1 and +1, irrespective of raw cluster sizes or
  dataset size variations.
- - It doesn’t require a ground truth for computation which makes it ideal for unsupervised learning model
- evaluations.
+ - It does not require a ground truth for computation, making it ideal for unsupervised learning model evaluations.
  - It penalizes for false positives and false negatives, providing a robust measure of clustering quality.
 
- **5. Limitations:**
+ ### Limitations
+
  - In real-world situations, true clustering is often unknown, which can hinder the practical application of the ARI.
  - The ARI requires all individual data instances to be independent, which may not always hold true.
- - It may be difficult to interpret the implications of an ARI score without a context or a benchmark, as it is
+ - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
  heavily dependent on the characteristics of the dataset used.
  """
 
  name = "adjusted_rand_index"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",
@@ -24,36 +24,38 @@ class ClassifierPerformance(Metric):
  Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,
  and ROC AUC scores.
 
- **Purpose**: The supplied script is designed to evaluate the performance of Machine Learning classification models.
+ ### Purpose
+
+ The Classifier Performance test is designed to evaluate the performance of Machine Learning classification models.
  It accomplishes this by computing precision, recall, F1-Score, and accuracy, as well as the ROC AUC (Receiver
  operating characteristic - Area under the curve) scores, thereby providing a comprehensive analytic view of the
  models' performance. The test is adaptable, handling binary and multiclass models equally effectively.
 
- **Test Mechanism**: The script produces a report that includes precision, recall, F1-Score, and accuracy, by
- leveraging the `classification_report` from the scikit-learn's metrics module. For multiclass models, macro and
- weighted averages for these scores are also calculated. Additionally, the ROC AUC scores are calculated and
- included in the report using the script's unique `multiclass_roc_auc_score` function. The outcome of the test
- (report format) differs based on whether the model is binary or multiclass.
+ ### Test Mechanism
+
+ The test produces a report that includes precision, recall, F1-Score, and accuracy, by leveraging the
+ `classification_report` from scikit-learn's metrics module. For multiclass models, macro and weighted averages for
+ these scores are also calculated. Additionally, the ROC AUC scores are calculated and included in the report using
+ the `multiclass_roc_auc_score` function. The outcome of the test (report format) differs based on whether the model
+ is binary or multiclass.
+
+ ### Signs of High Risk
 
- **Signs of High Risk**:
  - Low values for precision, recall, F1-Score, accuracy, and ROC AUC, indicating poor performance.
- - Imbalance in precision and recall scores. Precision highlights correct positive class predictions, while recall
- indicates the accurate identification of actual positive cases. Imbalance may indicate flawed model performance.
- - A low ROC AUC score, especially scores close to 0.5 or lower, strongly suggests a failing model.
-
- **Strengths**:
- - The script is versatile, capable of assessing both binary and multiclass models.
- - It uses a variety of commonly employed performance metrics, offering a comprehensive view of a model's
- performance.
- - The use of ROC-AUC as a metric aids in determining the most optimal threshold for classification, especially
- beneficial when evaluation datasets are unbalanced.
-
- **Limitations**:
- - The test assumes correctly identified labels for binary classification models and raises an exception if the
- positive class is not labeled as "1". However, this setup may not align with all practical applications.
- - This script is specifically designed for classification models and is not suited to evaluate regression models.
- - The metrics computed may provide limited insights in cases where the test dataset does not adequately represent
- the data the model will encounter in real-world scenarios.
+ - Imbalance in precision and recall scores.
+ - A low ROC AUC score, especially scores close to 0.5 or lower, suggesting a failing model.
+
+ ### Strengths
+
+ - Versatile, capable of assessing both binary and multiclass models.
+ - Utilizes a variety of commonly employed performance metrics, offering a comprehensive view of model performance.
+ - The use of ROC-AUC as a metric is beneficial for evaluating unbalanced datasets.
+
+ ### Limitations
+
+ - Assumes correctly identified labels for binary classification models.
+ - Specifically designed for classification models and not suitable for regression models.
+ - May provide limited insights if the test dataset does not represent real-world scenarios adequately.
  """
 
  name = "classifier_performance"
@@ -132,7 +134,7 @@ class ClassifierPerformance(Metric):
  if len(np.unique(y_true)) > 2:
  y_pred = self.inputs.dataset.y_pred(self.inputs.model)
  y_true = y_true.astype(y_pred.dtype)
- roc_auc = self.multiclass_roc_auc_score(y_true, y_pred)
+ roc_auc = multiclass_roc_auc_score(y_true, y_pred)
  else:
  y_prob = self.inputs.dataset.y_prob(self.inputs.model)
  y_true = y_true.astype(y_prob.dtype).flatten()
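
The hunk above swaps the method call for a module-level `multiclass_roc_auc_score` helper whose body is not shown in this diff. A common one-vs-rest formulation looks roughly like the sketch below (an assumption for context, not the package's actual implementation):

```python
# Assumed sketch of a one-vs-rest multiclass ROC AUC helper; the real helper in
# validmind may differ in name, signature, and averaging strategy.
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def multiclass_roc_auc_sketch(y_true, y_pred, average="macro"):
    lb = LabelBinarizer().fit(y_true)
    # Binarize both label vectors and compute a macro-averaged one-vs-rest AUC.
    return roc_auc_score(lb.transform(y_true), lb.transform(y_pred), average=average)
```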
@@ -16,19 +16,21 @@ class ClusterCosineSimilarity(Metric):
  """
  Measures the intra-cluster similarity of a clustering model using cosine similarity.
 
- **1. Purpose:**
+ ### Purpose
+
  The purpose of this metric is to measure how similar the data points within each cluster of a clustering model are.
  This is done using cosine similarity, which compares the multi-dimensional direction (but not magnitude) of data
  vectors. From a Model Risk Management perspective, this metric is used to quantitatively validate that clusters
  formed by a model have high intra-cluster similarity.
 
- **2. Test Mechanism:**
+ ### Test Mechanism
+
  This test works by first extracting the true and predicted clusters of the model's training data. Then, it computes
  the centroid (average data point) of each cluster. Next, it calculates the cosine similarity between each data
  point within a cluster and its respective centroid. Finally, it outputs the mean cosine similarity of each cluster,
  highlighting how similar, on average, data points in a cluster are to the cluster's centroid.
 
- **3. Signs of High Risk:**
+ ### Signs of High Risk
 
  - Low mean cosine similarity for one or more clusters: If the mean cosine similarity is low, the data points within
  the respective cluster have high variance in their directions. This can be indicative of poor clustering,
@@ -36,7 +38,7 @@ class ClusterCosineSimilarity(Metric):
  - High disparity between mean cosine similarity values across clusters: If there's a significant difference in mean
  cosine similarity across different clusters, this could indicate imbalance in how the model forms clusters.
 
- **4. Strengths:**
+ ### Strengths
 
  - Cosine similarity operates in a multi-dimensional space, making it effective for measuring similarity in high
  dimensional datasets, typical for many machine learning problems.
@@ -44,7 +46,7 @@ class ClusterCosineSimilarity(Metric):
  of each vector.
  - This metric is not dependent on the scale of the variables, making it equally effective on different scales.
 
- **5. Limitations:**
+ ### Limitations
 
  - Cosine similarity does not consider magnitudes (i.e. lengths) of vectors, only their direction. This means it may
  overlook instances where clusters have been adequately separated in terms of magnitude.
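
The centroid-based mechanism in the ClusterCosineSimilarity docstring can be reproduced in a few lines; a rough sketch, assuming `X` is the feature matrix and `labels` the predicted cluster assignments (names are illustrative):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def mean_intra_cluster_cosine(X, labels):
    """Mean cosine similarity between each cluster's points and that cluster's centroid."""
    scores = {}
    for cluster in np.unique(labels):
        points = X[labels == cluster]
        centroid = points.mean(axis=0, keepdims=True)  # shape (1, n_features)
        scores[cluster] = float(cosine_similarity(points, centroid).mean())
    return scores
```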
@@ -4,7 +4,7 @@
 
  from dataclasses import dataclass
 
- from validmind.vm_models import Metric, ResultSummary, ResultTable
+ from validmind.vm_models import Metric
 
 
  @dataclass
@@ -13,106 +13,68 @@ class ClusterPerformance(Metric):
  Evaluates and compares a clustering model's performance on training and testing datasets using multiple defined
  metrics.
 
- **Purpose:** This metric, ClusterPerformance, evaluates the performance of a clustering model on both the training
- and testing datasets. It assesses how well the model defines, forms, and distinguishes clusters of data.
-
- **Test Mechanism:** The metric is applied by first predicting the clusters of the training and testing datasets
- using the clustering model. Next, performance metrics, defined in the method `metric_info()`, are calculated
- against the true labels of the datasets. The results for each metric for both datasets are then collated and
- returned in a summarized table form listing each metric along with its corresponding train and test values.
-
- **Signs of High Risk:**
- - High discrepancy between the performance metric values on the training and testing datasets. This could signify
- problems such as overfitting or underfitting.
- - Low performance metric values on the training and testing datasets. There might be a problem with the model
- itself or the chosen hyperparameters.
- - If the model's performance deteriorates consistently across different sets of metrics, this may suggest a broader
- issue with the model or the dataset.
-
- **Strengths:**
- - Tests the model's performance on both the training and testing datasets, which helps to identify issues such as
- overfitting or underfitting.
- - Allows for a broad range of performance metrics to be used, thus providing a comprehensive evaluation of the
- model's clustering capabilities.
- - Returns a summarized table, which makes it easy to compare the model's performance across different metrics and
- datasets.
-
- **Limitations:**
- - The method `metric_info()` needs to be properly overridden in a subclass for this class to be used, and the
- metrics to be used must be manually defined.
- - The performance metrics are calculated on predicted cluster labels, so the metric may not capture the model's
- performance well if the clusters are not well separated or if the model has difficulties with certain kinds of
- clusters.
- - Doesn't consider the computational and time complexity of the model. While the model may perform well in terms of
- the performance metrics, it might be time or resource-intensive. This metric does not account for such scenarios.
- - Because the comparison is binary (train and test), it might not capture scenarios where the performance changes
- drastically under different circumstances or categories within the dataset.
+ ### Purpose
+
+ The Cluster Performance test evaluates the performance of a clustering model on both the training and testing
+ datasets. It assesses how well the model defines, forms, and distinguishes clusters of data.
+
+ ### Test Mechanism
+
+ The test mechanism involves predicting the clusters of the training and testing datasets using the clustering
+ model. After prediction, performance metrics defined in the `metric_info()` method are calculated against the true
+ labels of the datasets. The results for each metric for both datasets are then collated and returned in a
+ summarized table form listing each metric along with its corresponding train and test values.
+
+ ### Signs of High Risk
+
+ - High discrepancy between the performance metric values on the training and testing datasets.
+ - Low performance metric values on both the training and testing datasets.
+ - Consistent deterioration of performance across different metrics.
+
+ ### Strengths
+
+ - Tests the model's performance on both training and testing datasets, helping to identify overfitting or
+ underfitting.
+ - Allows for the use of a broad range of performance metrics, providing a comprehensive evaluation.
+ - Returns a summarized table, making it easy to compare performance across different metrics and datasets.
+
+ ### Limitations
+
+ - The `metric_info()` method needs to be properly overridden in a subclass and metrics must be manually defined.
+ - The test may not capture the model's performance well if clusters are not well-separated or the model struggles
+ with certain clusters.
+ - Does not consider the computational and time complexity of the model.
+ - Binary comparison (train and test) might not capture performance changes under different circumstances or dataset
+ categories.
  """
 
  name = "cluster_performance_metrics"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",
  "model_performance",
  ]
 
- def cluster_performance_metrics(
- self, y_true_train, y_pred_train, y_true_test, y_pred_test, samples, metric_info
- ):
+ def cluster_performance_metrics(self, y_true_train, y_pred_train, metric_info):
  y_true_train = y_true_train.astype(y_pred_train.dtype).flatten()
- y_true_test = y_true_test.astype(y_pred_test.dtype).flatten()
  results = []
  for metric_name, metric_fcn in metric_info.items():
- for _ in samples:
- train_value = metric_fcn(list(y_true_train), y_pred_train)
- test_value = metric_fcn(list(y_true_test), y_pred_test)
- results.append(
- {
- metric_name: {
- "train": train_value,
- "test": test_value,
- }
- }
- )
+ train_value = metric_fcn(list(y_true_train), y_pred_train)
+ results.append({metric_name: train_value})
  return results
 
- def summary(self, raw_results):
- """
- Returns a summarized representation of the dataset split information
- """
- table_records = []
- for result in raw_results:
- for key, _ in result.items():
- table_records.append(
- {
- "Metric": key,
- "TRAIN": result[key]["train"],
- "TEST": result[key]["test"],
- }
- )
-
- return ResultSummary(results=[ResultTable(data=table_records)])
-
  def metric_info(self):
  raise NotImplementedError
 
  def run(self):
- y_true_train = self.inputs.datasets[0].y
- class_pred_train = self.inputs.datasets[0].y_pred(self.inputs.model)
+ y_true_train = self.inputs.dataset.y
+ class_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
  y_true_train = y_true_train.astype(class_pred_train.dtype)
 
- y_true_test = self.inputs.datasets[1].y
- class_pred_test = self.inputs.datasets[1].y_pred(self.inputs.model)
- y_true_test = y_true_test.astype(class_pred_test.dtype)
-
- samples = ["train", "test"]
  results = self.cluster_performance_metrics(
  y_true_train,
  class_pred_train,
- y_true_test,
- class_pred_test,
- samples,
  self.metric_info(),
  )
  return self.cache_results(metric_value=results)
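
After this refactor, `ClusterPerformance` scores a single `dataset` input and a subclass only needs to supply `metric_info()`. A hypothetical subclass might look like the sketch below (class name, metric names, and import path are assumptions inferred from this diff, not code shipped in the package):

```python
# Hypothetical subclass sketch; names and import path are assumptions based on this diff.
from sklearn import metrics
from validmind.tests.model_validation.sklearn.ClusterPerformance import ClusterPerformance

class ExampleClusterScores(ClusterPerformance):
    name = "example_cluster_scores"
    required_inputs = ["model", "dataset"]  # single dataset, per the change above
    tasks = ["clustering"]
    tags = ["sklearn", "model_performance"]

    def metric_info(self):
        # Mapping of display name -> callable(y_true, y_pred), as consumed by
        # cluster_performance_metrics() in the hunk above.
        return {
            "Homogeneity": metrics.homogeneity_score,
            "Completeness": metrics.completeness_score,
            "V Measure": metrics.v_measure_score,
        }
```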
@@ -16,33 +16,33 @@ class ClusterPerformanceMetrics(ClusterPerformance):
  """
  Evaluates the performance of clustering machine learning models using multiple established metrics.
 
- **Purpose:**
+ ### Purpose
 
  The `ClusterPerformanceMetrics` test is used to assess the performance and validity of clustering machine learning
  models. It evaluates homogeneity, completeness, V measure score, the Adjusted Rand Index, the Adjusted Mutual
  Information, and the Fowlkes-Mallows score of the model. These metrics provide a holistic understanding of the
  model's ability to accurately form clusters of the given dataset.
 
- **Test Mechanism:**
+ ### Test Mechanism
 
  The `ClusterPerformanceMetrics` test runs a clustering ML model over a given dataset and then calculates six
  metrics using the Scikit-learn metrics computation functions: Homogeneity Score, Completeness Score, V Measure,
  Adjusted Rand Index (ARI), Adjusted Mutual Information (AMI), and Fowlkes-Mallows Score. It then returns the result
  as a summary, presenting the metric values for both training and testing datasets.
 
- **Signs of High Risk:**
+ ### Signs of High Risk
 
- - Low Homogeneity Score: This indicates that the clusters formed contain a variety of classes, resulting in less
- pure clusters.
- - Low Completeness Score: This suggests that class instances are scattered across multiple clusters rather than
- being gathered in a single cluster.
- - Low V Measure: This would report a low overall clustering performance.
- - ARI close to 0 or Negative: This implies that clustering results are random or disagree with the true labels.
- - AMI close to 0: It means that clustering labels are random compared with the true labels.
+ - Low Homogeneity Score: Indicates that the clusters formed contain a variety of classes, resulting in less pure
+ clusters.
+ - Low Completeness Score: Suggests that class instances are scattered across multiple clusters rather than being
+ gathered in a single cluster.
+ - Low V Measure: Reports a low overall clustering performance.
+ - ARI close to 0 or Negative: Implies that clustering results are random or disagree with the true labels.
+ - AMI close to 0: Means that clustering labels are random compared with the true labels.
  - Low Fowlkes-Mallows score: Signifies less precise and poor clustering performance in terms of precision and
  recall.
 
- **Strengths:**
+ ### Strengths
 
  - Provides a comprehensive view of clustering model performance by examining multiple clustering metrics.
  - Uses established and widely accepted metrics from scikit-learn, providing reliability in the results.
@@ -50,9 +50,9 @@ class ClusterPerformanceMetrics(ClusterPerformance):
  - Clearly defined and human-readable descriptions of each score make it easy to understand what each score
  represents.
 
- **Limitations:**
+ ### Limitations
 
- - It only applies to clustering models; not suitable for other types of machine learning models.
+ - Only applies to clustering models; not suitable for other types of machine learning models.
  - Does not test for overfitting or underfitting in the clustering model.
  - All the scores rely on ground truth labels, the absence or inaccuracy of which can lead to misleading results.
  - Does not consider aspects like computational efficiency of the model or its capability to handle high dimensional
@@ -60,7 +60,7 @@ class ClusterPerformanceMetrics(ClusterPerformance):
  """
 
  name = "homogeneity_score"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = ["sklearn", "model_performance"]
  default_metrics = {
@@ -121,10 +121,8 @@ class ClusterPerformanceMetrics(ClusterPerformance):
  for key, _ in result.items():
  table_records.append(
  {
- "Metric": key,
  "Description": self.default_metrics_desc[key],
- "TRAIN": result[key]["train"],
- "TEST": result[key]["test"],
+ key: result[key],
  }
  )
 
@@ -14,26 +14,32 @@ class CompletenessScore(ClusterPerformance):
  """
  Evaluates a clustering model's capacity to categorize instances from a single class into the same cluster.
 
- **Purpose:** The Completeness Score metric is used to assess the performance of clustering models. It measures the
- extent to which all the data points that are members of a given class are elements of the same cluster. The aim is
- to determine the capability of the model to categorize all instances from a single class into the same cluster.
+ ### Purpose
 
- **Test Mechanism:** This test takes three inputs, a model and its associated training and testing datasets. It
- invokes the `completeness_score` function from the sklearn library on the labels predicted by the model. High
- scores indicate that data points from the same class generally appear in the same cluster, while low scores suggest
- the opposite.
+ The Completeness Score metric is used to assess the performance of clustering models. It measures the extent to
+ which all the data points that are members of a given class are elements of the same cluster. The aim is to
+ determine the capability of the model to categorize all instances from a single class into the same cluster.
+
+ ### Test Mechanism
+
+ This test takes three inputs, a model and its associated training and testing datasets. It invokes the
+ `completeness_score` function from the sklearn library on the labels predicted by the model. High scores indicate
+ that data points from the same class generally appear in the same cluster, while low scores suggest the opposite.
+
+ ### Signs of High Risk
 
- **Signs of High Risk:**
  - Low completeness score: This suggests that the model struggles to group instances from the same class into one
  cluster, indicating poor clustering performance.
 
- **Strengths:**
+ ### Strengths
+
  - The Completeness Score provides an effective method for assessing the performance of a clustering model,
  specifically its ability to group class instances together.
  - This test metric conveniently relies on the capabilities provided by the sklearn library, ensuring consistent and
  reliable test results.
 
- **Limitations:**
+ ### Limitations
+
  - This metric only evaluates a specific aspect of clustering, meaning it may not provide a holistic or complete
  view of the model's performance.
  - It cannot assess the effectiveness of the model in differentiating between separate classes, as it is solely
@@ -43,7 +49,7 @@ class CompletenessScore(ClusterPerformance):
  """
 
  name = "homogeneity_score"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",
@@ -17,33 +17,40 @@ class ConfusionMatrix(Metric):
  Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix
  heatmap.
 
- **Purpose**: The Confusion Matrix tester is designed to assess the performance of a classification Machine Learning
- model. This performance is evaluated based on how well the model is able to correctly classify True Positives, True
- Negatives, False Positives, and False Negatives - fundamental aspects of model accuracy.
-
- **Test Mechanism**: The mechanism used involves taking the predicted results (`y_test_predict`) from the
- classification model and comparing them against the actual values (`y_test_true`). A confusion matrix is built
- using the unique labels extracted from `y_test_true`, employing scikit-learn's metrics. The matrix is then visually
- rendered with the help of Plotly's `create_annotated_heatmap` function. A heatmap is created which provides a
- two-dimensional graphical representation of the model's performance, showcasing distributions of True Positives
- (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN).
-
- **Signs of High Risk**: Indicators of high risk related to the model include:
+ ### Purpose
+
+ The Confusion Matrix tester is designed to assess the performance of a classification Machine Learning model. This
+ performance is evaluated based on how well the model is able to correctly classify True Positives, True Negatives,
+ False Positives, and False Negatives - fundamental aspects of model accuracy.
+
+ ### Test Mechanism
+
+ The mechanism used involves taking the predicted results (`y_test_predict`) from the classification model and
+ comparing them against the actual values (`y_test_true`). A confusion matrix is built using the unique labels
+ extracted from `y_test_true`, employing scikit-learn's metrics. The matrix is then visually rendered with the help
+ of Plotly's `create_annotated_heatmap` function. A heatmap is created which provides a two-dimensional graphical
+ representation of the model's performance, showcasing distributions of True Positives (TP), True Negatives (TN),
+ False Positives (FP), and False Negatives (FN).
+
+ ### Signs of High Risk
+
  - High numbers of False Positives (FP) and False Negatives (FN), depicting that the model is not effectively
  classifying the values.
  - Low numbers of True Positives (TP) and True Negatives (TN), implying that the model is struggling with correctly
  identifying class labels.
 
- **Strengths**: The Confusion Matrix tester brings numerous strengths:
+ ### Strengths
+
  - It provides a simplified yet comprehensive visual snapshot of the classification model's predictive performance.
  - It distinctly brings out True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives
- (FN), thus, making it easier to focus on potential areas of improvement.
+ (FN), thus making it easier to focus on potential areas of improvement.
  - The matrix is beneficial in dealing with multi-class classification problems as it can provide a simple view of
  complex model performances.
  - It aids in understanding the different types of errors that the model could potentially make, as it provides
  in-depth insights into Type-I and Type-II errors.
 
- **Limitations**: Despite its various strengths, the Confusion Matrix tester does exhibit some limitations:
+ ### Limitations
+
  - In cases of unbalanced classes, the effectiveness of the confusion matrix might be lessened. It may wrongly
  interpret the accuracy of a model that is essentially just predicting the majority class.
  - It does not provide a single unified statistic that could evaluate the overall performance of the model.
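
The mechanism this docstring describes (a scikit-learn confusion matrix rendered as an annotated Plotly heatmap) can be approximated standalone; a hedged sketch with made-up predictions, not the package's exact plotting code:

```python
import numpy as np
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 1, 1, 0, 1, 0])  # actual values (toy data)
y_pred = np.array([0, 1, 0, 0, 1, 1])  # model predictions (toy data)

labels = np.unique(y_true)
cm = confusion_matrix(y_true, y_pred, labels=labels)

# Annotated heatmap of the confusion matrix, axes labeled with the class values.
fig = ff.create_annotated_heatmap(
    z=cm,
    x=[str(label) for label in labels],
    y=[str(label) for label in labels],
    colorscale="Blues",
)
fig.update_layout(xaxis_title="Predicted", yaxis_title="Actual")
fig.show()
```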