validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
@@ -24,36 +24,38 @@ class ClassifierPerformance(Metric):
  Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,
  and ROC AUC scores.
 
- **Purpose**: The supplied script is designed to evaluate the performance of Machine Learning classification models.
+ ### Purpose
+
+ The Classifier Performance test is designed to evaluate the performance of Machine Learning classification models.
  It accomplishes this by computing precision, recall, F1-Score, and accuracy, as well as the ROC AUC (Receiver
  operating characteristic - Area under the curve) scores, thereby providing a comprehensive analytic view of the
  models' performance. The test is adaptable, handling binary and multiclass models equally effectively.
 
- **Test Mechanism**: The script produces a report that includes precision, recall, F1-Score, and accuracy, by
- leveraging the `classification_report` from the scikit-learn's metrics module. For multiclass models, macro and
- weighted averages for these scores are also calculated. Additionally, the ROC AUC scores are calculated and
- included in the report using the script's unique `multiclass_roc_auc_score` function. The outcome of the test
- (report format) differs based on whether the model is binary or multiclass.
+ ### Test Mechanism
+
+ The test produces a report that includes precision, recall, F1-Score, and accuracy, by leveraging the
+ `classification_report` from scikit-learn's metrics module. For multiclass models, macro and weighted averages for
+ these scores are also calculated. Additionally, the ROC AUC scores are calculated and included in the report using
+ the `multiclass_roc_auc_score` function. The outcome of the test (report format) differs based on whether the model
+ is binary or multiclass.
+
+ ### Signs of High Risk
 
- **Signs of High Risk**:
  - Low values for precision, recall, F1-Score, accuracy, and ROC AUC, indicating poor performance.
- - Imbalance in precision and recall scores. Precision highlights correct positive class predictions, while recall
- indicates the accurate identification of actual positive cases. Imbalance may indicate flawed model performance.
- - A low ROC AUC score, especially scores close to 0.5 or lower, strongly suggests a failing model.
-
- **Strengths**:
- - The script is versatile, capable of assessing both binary and multiclass models.
- - It uses a variety of commonly employed performance metrics, offering a comprehensive view of a model's
- performance.
- - The use of ROC-AUC as a metric aids in determining the most optimal threshold for classification, especially
- beneficial when evaluation datasets are unbalanced.
-
- **Limitations**:
- - The test assumes correctly identified labels for binary classification models and raises an exception if the
- positive class is not labeled as "1". However, this setup may not align with all practical applications.
- - This script is specifically designed for classification models and is not suited to evaluate regression models.
- - The metrics computed may provide limited insights in cases where the test dataset does not adequately represent
- the data the model will encounter in real-world scenarios.
+ - Imbalance in precision and recall scores.
+ - A low ROC AUC score, especially scores close to 0.5 or lower, suggesting a failing model.
+
+ ### Strengths
+
+ - Versatile, capable of assessing both binary and multiclass models.
+ - Utilizes a variety of commonly employed performance metrics, offering a comprehensive view of model performance.
+ - The use of ROC-AUC as a metric is beneficial for evaluating unbalanced datasets.
+
+ ### Limitations
+
+ - Assumes correctly identified labels for binary classification models.
+ - Specifically designed for classification models and not suitable for regression models.
+ - May provide limited insights if the test dataset does not represent real-world scenarios adequately.
  """
 
  name = "classifier_performance"
@@ -132,7 +134,7 @@ class ClassifierPerformance(Metric):
  if len(np.unique(y_true)) > 2:
  y_pred = self.inputs.dataset.y_pred(self.inputs.model)
  y_true = y_true.astype(y_pred.dtype)
- roc_auc = self.multiclass_roc_auc_score(y_true, y_pred)
+ roc_auc = multiclass_roc_auc_score(y_true, y_pred)
  else:
  y_prob = self.inputs.dataset.y_prob(self.inputs.model)
  y_true = y_true.astype(y_prob.dtype).flatten()
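The hunk above swaps the bound method call `self.multiclass_roc_auc_score(...)` for a module-level `multiclass_roc_auc_score(...)` helper. The helper's body is not shown in this diff; the following is only a minimal sketch of the usual one-vs-rest approach such a function takes, assuming plain scikit-learn, not the package's actual implementation:

```python
# Hypothetical sketch of a one-vs-rest multiclass ROC AUC helper.
# Not taken from the validmind source; shown only to illustrate the idea.
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer


def multiclass_roc_auc_score(y_true, y_pred, average="macro"):
    # Binarize both label vectors into one-hot indicator matrices so that
    # roc_auc_score can treat each class as its own binary problem.
    lb = LabelBinarizer()
    lb.fit(y_true)
    return roc_auc_score(lb.transform(y_true), lb.transform(y_pred), average=average)
```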
@@ -16,19 +16,21 @@ class ClusterCosineSimilarity(Metric):
  """
  Measures the intra-cluster similarity of a clustering model using cosine similarity.
 
- **1. Purpose:**
+ ### Purpose
+
  The purpose of this metric is to measure how similar the data points within each cluster of a clustering model are.
  This is done using cosine similarity, which compares the multi-dimensional direction (but not magnitude) of data
  vectors. From a Model Risk Management perspective, this metric is used to quantitatively validate that clusters
  formed by a model have high intra-cluster similarity.
 
- **2. Test Mechanism:**
+ ### Test Mechanism
+
  This test works by first extracting the true and predicted clusters of the model's training data. Then, it computes
  the centroid (average data point) of each cluster. Next, it calculates the cosine similarity between each data
  point within a cluster and its respective centroid. Finally, it outputs the mean cosine similarity of each cluster,
  highlighting how similar, on average, data points in a cluster are to the cluster's centroid.
 
- **3. Signs of High Risk:**
+ ### Signs of High Risk
 
  - Low mean cosine similarity for one or more clusters: If the mean cosine similarity is low, the data points within
  the respective cluster have high variance in their directions. This can be indicative of poor clustering,
@@ -36,7 +38,7 @@ class ClusterCosineSimilarity(Metric):
  - High disparity between mean cosine similarity values across clusters: If there's a significant difference in mean
  cosine similarity across different clusters, this could indicate imbalance in how the model forms clusters.
 
- **4. Strengths:**
+ ### Strengths
 
  - Cosine similarity operates in a multi-dimensional space, making it effective for measuring similarity in high
  dimensional datasets, typical for many machine learning problems.
@@ -44,7 +46,7 @@ class ClusterCosineSimilarity(Metric):
  of each vector.
  - This metric is not dependent on the scale of the variables, making it equally effective on different scales.
 
- **5. Limitations:**
+ ### Limitations
 
  - Cosine similarity does not consider magnitudes (i.e. lengths) of vectors, only their direction. This means it may
  overlook instances where clusters have been adequately separated in terms of magnitude.
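The `ClusterCosineSimilarity` mechanism described above (per-cluster centroid, then mean cosine similarity of members to that centroid) can be approximated in a few lines of scikit-learn. A rough sketch, assuming `X` is a NumPy feature matrix and `labels` holds the predicted cluster ids; this is illustrative only, not the test's actual code:

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def mean_intra_cluster_cosine(X, labels):
    """Mean cosine similarity of each cluster's members to the cluster centroid."""
    scores = {}
    for cluster_id in np.unique(labels):
        members = X[labels == cluster_id]
        centroid = members.mean(axis=0, keepdims=True)
        # cosine_similarity returns an (n_members, 1) matrix against the centroid
        scores[cluster_id] = float(cosine_similarity(members, centroid).mean())
    return scores
```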
@@ -4,7 +4,7 @@
 
  from dataclasses import dataclass
 
- from validmind.vm_models import Metric, ResultSummary, ResultTable
+ from validmind.vm_models import Metric
 
 
  @dataclass
@@ -13,106 +13,68 @@ class ClusterPerformance(Metric):
  Evaluates and compares a clustering model's performance on training and testing datasets using multiple defined
  metrics.
 
- **Purpose:** This metric, ClusterPerformance, evaluates the performance of a clustering model on both the training
- and testing datasets. It assesses how well the model defines, forms, and distinguishes clusters of data.
-
- **Test Mechanism:** The metric is applied by first predicting the clusters of the training and testing datasets
- using the clustering model. Next, performance metrics, defined in the method `metric_info()`, are calculated
- against the true labels of the datasets. The results for each metric for both datasets are then collated and
- returned in a summarized table form listing each metric along with its corresponding train and test values.
-
- **Signs of High Risk:**
- - High discrepancy between the performance metric values on the training and testing datasets. This could signify
- problems such as overfitting or underfitting.
- - Low performance metric values on the training and testing datasets. There might be a problem with the model
- itself or the chosen hyperparameters.
- - If the model's performance deteriorates consistently across different sets of metrics, this may suggest a broader
- issue with the model or the dataset.
-
- **Strengths:**
- - Tests the model's performance on both the training and testing datasets, which helps to identify issues such as
- overfitting or underfitting.
- - Allows for a broad range of performance metrics to be used, thus providing a comprehensive evaluation of the
- model's clustering capabilities.
- - Returns a summarized table, which makes it easy to compare the model's performance across different metrics and
- datasets.
-
- **Limitations:**
- - The method `metric_info()` needs to be properly overridden in a subclass for this class to be used, and the
- metrics to be used must be manually defined.
- - The performance metrics are calculated on predicted cluster labels, so the metric may not capture the model's
- performance well if the clusters are not well separated or if the model has difficulties with certain kinds of
- clusters.
- - Doesn't consider the computational and time complexity of the model. While the model may perform well in terms of
- the performance metrics, it might be time or resource-intensive. This metric does not account for such scenarios.
- - Because the comparison is binary (train and test), it might not capture scenarios where the performance changes
- drastically under different circumstances or categories within the dataset.
+ ### Purpose
+
+ The Cluster Performance test evaluates the performance of a clustering model on both the training and testing
+ datasets. It assesses how well the model defines, forms, and distinguishes clusters of data.
+
+ ### Test Mechanism
+
+ The test mechanism involves predicting the clusters of the training and testing datasets using the clustering
+ model. After prediction, performance metrics defined in the `metric_info()` method are calculated against the true
+ labels of the datasets. The results for each metric for both datasets are then collated and returned in a
+ summarized table form listing each metric along with its corresponding train and test values.
+
+ ### Signs of High Risk
+
+ - High discrepancy between the performance metric values on the training and testing datasets.
+ - Low performance metric values on both the training and testing datasets.
+ - Consistent deterioration of performance across different metrics.
+
+ ### Strengths
+
+ - Tests the model's performance on both training and testing datasets, helping to identify overfitting or
+ underfitting.
+ - Allows for the use of a broad range of performance metrics, providing a comprehensive evaluation.
+ - Returns a summarized table, making it easy to compare performance across different metrics and datasets.
+
+ ### Limitations
+
+ - The `metric_info()` method needs to be properly overridden in a subclass and metrics must be manually defined.
+ - The test may not capture the model's performance well if clusters are not well-separated or the model struggles
+ with certain clusters.
+ - Does not consider the computational and time complexity of the model.
+ - Binary comparison (train and test) might not capture performance changes under different circumstances or dataset
+ categories.
  """
 
  name = "cluster_performance_metrics"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",
  "model_performance",
  ]
 
- def cluster_performance_metrics(
- self, y_true_train, y_pred_train, y_true_test, y_pred_test, samples, metric_info
- ):
+ def cluster_performance_metrics(self, y_true_train, y_pred_train, metric_info):
  y_true_train = y_true_train.astype(y_pred_train.dtype).flatten()
- y_true_test = y_true_test.astype(y_pred_test.dtype).flatten()
  results = []
  for metric_name, metric_fcn in metric_info.items():
- for _ in samples:
- train_value = metric_fcn(list(y_true_train), y_pred_train)
- test_value = metric_fcn(list(y_true_test), y_pred_test)
- results.append(
- {
- metric_name: {
- "train": train_value,
- "test": test_value,
- }
- }
- )
+ train_value = metric_fcn(list(y_true_train), y_pred_train)
+ results.append({metric_name: train_value})
  return results
 
- def summary(self, raw_results):
- """
- Returns a summarized representation of the dataset split information
- """
- table_records = []
- for result in raw_results:
- for key, _ in result.items():
- table_records.append(
- {
- "Metric": key,
- "TRAIN": result[key]["train"],
- "TEST": result[key]["test"],
- }
- )
-
- return ResultSummary(results=[ResultTable(data=table_records)])
-
  def metric_info(self):
  raise NotImplementedError
 
  def run(self):
- y_true_train = self.inputs.datasets[0].y
- class_pred_train = self.inputs.datasets[0].y_pred(self.inputs.model)
+ y_true_train = self.inputs.dataset.y
+ class_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
  y_true_train = y_true_train.astype(class_pred_train.dtype)
 
- y_true_test = self.inputs.datasets[1].y
- class_pred_test = self.inputs.datasets[1].y_pred(self.inputs.model)
- y_true_test = y_true_test.astype(class_pred_test.dtype)
-
- samples = ["train", "test"]
  results = self.cluster_performance_metrics(
  y_true_train,
  class_pred_train,
- y_true_test,
- class_pred_test,
- samples,
  self.metric_info(),
  )
  return self.cache_results(metric_value=results)
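The refactor above narrows `ClusterPerformance` to a single `dataset` input and keeps `metric_info()` as an abstract hook. A hypothetical subclass, sketched only from the structure visible in this hunk (the import path is assumed from the file listing above; the real subclasses such as `ClusterPerformanceMetrics` ship with the package):

```python
from sklearn import metrics

from validmind.tests.model_validation.sklearn.ClusterPerformance import (
    ClusterPerformance,  # assumed import path, based on the file listing above
)


class MyClusterPerformance(ClusterPerformance):
    name = "my_cluster_performance"
    required_inputs = ["model", "dataset"]
    tasks = ["clustering"]
    tags = ["sklearn", "model_performance"]

    def metric_info(self):
        # Map display names to callables taking (y_true, y_pred), as run() expects.
        return {"Adjusted Rand Index": metrics.adjusted_rand_score}
```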
@@ -16,33 +16,33 @@ class ClusterPerformanceMetrics(ClusterPerformance):
  """
  Evaluates the performance of clustering machine learning models using multiple established metrics.
 
- **Purpose:**
+ ### Purpose
 
  The `ClusterPerformanceMetrics` test is used to assess the performance and validity of clustering machine learning
  models. It evaluates homogeneity, completeness, V measure score, the Adjusted Rand Index, the Adjusted Mutual
  Information, and the Fowlkes-Mallows score of the model. These metrics provide a holistic understanding of the
  model's ability to accurately form clusters of the given dataset.
 
- **Test Mechanism:**
+ ### Test Mechanism
 
  The `ClusterPerformanceMetrics` test runs a clustering ML model over a given dataset and then calculates six
  metrics using the Scikit-learn metrics computation functions: Homogeneity Score, Completeness Score, V Measure,
  Adjusted Rand Index (ARI), Adjusted Mutual Information (AMI), and Fowlkes-Mallows Score. It then returns the result
  as a summary, presenting the metric values for both training and testing datasets.
 
- **Signs of High Risk:**
+ ### Signs of High Risk
 
- - Low Homogeneity Score: This indicates that the clusters formed contain a variety of classes, resulting in less
- pure clusters.
- - Low Completeness Score: This suggests that class instances are scattered across multiple clusters rather than
- being gathered in a single cluster.
- - Low V Measure: This would report a low overall clustering performance.
- - ARI close to 0 or Negative: This implies that clustering results are random or disagree with the true labels.
- - AMI close to 0: It means that clustering labels are random compared with the true labels.
+ - Low Homogeneity Score: Indicates that the clusters formed contain a variety of classes, resulting in less pure
+ clusters.
+ - Low Completeness Score: Suggests that class instances are scattered across multiple clusters rather than being
+ gathered in a single cluster.
+ - Low V Measure: Reports a low overall clustering performance.
+ - ARI close to 0 or Negative: Implies that clustering results are random or disagree with the true labels.
+ - AMI close to 0: Means that clustering labels are random compared with the true labels.
  - Low Fowlkes-Mallows score: Signifies less precise and poor clustering performance in terms of precision and
  recall.
 
- **Strengths:**
+ ### Strengths
 
  - Provides a comprehensive view of clustering model performance by examining multiple clustering metrics.
  - Uses established and widely accepted metrics from scikit-learn, providing reliability in the results.
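The six scores named in the Test Mechanism above all come straight from `sklearn.metrics`. A compact, illustrative sketch of computing them on true versus predicted cluster labels (not the package's code, which wraps these functions in its `default_metrics` mapping):

```python
from sklearn import metrics


def clustering_scores(y_true, y_pred):
    # Each score compares predicted cluster labels against ground-truth labels.
    return {
        "Homogeneity": metrics.homogeneity_score(y_true, y_pred),
        "Completeness": metrics.completeness_score(y_true, y_pred),
        "V Measure": metrics.v_measure_score(y_true, y_pred),
        "Adjusted Rand Index": metrics.adjusted_rand_score(y_true, y_pred),
        "Adjusted Mutual Information": metrics.adjusted_mutual_info_score(y_true, y_pred),
        "Fowlkes-Mallows": metrics.fowlkes_mallows_score(y_true, y_pred),
    }
```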
@@ -50,9 +50,9 @@ class ClusterPerformanceMetrics(ClusterPerformance):
  - Clearly defined and human-readable descriptions of each score make it easy to understand what each score
  represents.
 
- **Limitations:**
+ ### Limitations
 
- - It only applies to clustering models; not suitable for other types of machine learning models.
+ - Only applies to clustering models; not suitable for other types of machine learning models.
  - Does not test for overfitting or underfitting in the clustering model.
  - All the scores rely on ground truth labels, the absence or inaccuracy of which can lead to misleading results.
  - Does not consider aspects like computational efficiency of the model or its capability to handle high dimensional
@@ -60,7 +60,7 @@ class ClusterPerformanceMetrics(ClusterPerformance):
  """
 
  name = "homogeneity_score"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = ["sklearn", "model_performance"]
  default_metrics = {
@@ -121,10 +121,8 @@ class ClusterPerformanceMetrics(ClusterPerformance):
  for key, _ in result.items():
  table_records.append(
  {
- "Metric": key,
  "Description": self.default_metrics_desc[key],
- "TRAIN": result[key]["train"],
- "TEST": result[key]["test"],
+ key: result[key],
  }
  )
 
@@ -14,26 +14,32 @@ class CompletenessScore(ClusterPerformance):
  """
  Evaluates a clustering model's capacity to categorize instances from a single class into the same cluster.
 
- **Purpose:** The Completeness Score metric is used to assess the performance of clustering models. It measures the
- extent to which all the data points that are members of a given class are elements of the same cluster. The aim is
- to determine the capability of the model to categorize all instances from a single class into the same cluster.
+ ### Purpose
 
- **Test Mechanism:** This test takes three inputs, a model and its associated training and testing datasets. It
- invokes the `completeness_score` function from the sklearn library on the labels predicted by the model. High
- scores indicate that data points from the same class generally appear in the same cluster, while low scores suggest
- the opposite.
+ The Completeness Score metric is used to assess the performance of clustering models. It measures the extent to
+ which all the data points that are members of a given class are elements of the same cluster. The aim is to
+ determine the capability of the model to categorize all instances from a single class into the same cluster.
+
+ ### Test Mechanism
+
+ This test takes three inputs, a model and its associated training and testing datasets. It invokes the
+ `completeness_score` function from the sklearn library on the labels predicted by the model. High scores indicate
+ that data points from the same class generally appear in the same cluster, while low scores suggest the opposite.
+
+ ### Signs of High Risk
 
- **Signs of High Risk:**
  - Low completeness score: This suggests that the model struggles to group instances from the same class into one
  cluster, indicating poor clustering performance.
 
- **Strengths:**
+ ### Strengths
+
  - The Completeness Score provides an effective method for assessing the performance of a clustering model,
  specifically its ability to group class instances together.
  - This test metric conveniently relies on the capabilities provided by the sklearn library, ensuring consistent and
  reliable test results.
 
- **Limitations:**
+ ### Limitations
+
  - This metric only evaluates a specific aspect of clustering, meaning it may not provide a holistic or complete
  view of the model's performance.
  - It cannot assess the effectiveness of the model in differentiating between separate classes, as it is solely
@@ -43,7 +49,7 @@ class CompletenessScore(ClusterPerformance):
  """
 
  name = "homogeneity_score"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",
@@ -17,33 +17,40 @@ class ConfusionMatrix(Metric):
  Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix
  heatmap.
 
- **Purpose**: The Confusion Matrix tester is designed to assess the performance of a classification Machine Learning
- model. This performance is evaluated based on how well the model is able to correctly classify True Positives, True
- Negatives, False Positives, and False Negatives - fundamental aspects of model accuracy.
-
- **Test Mechanism**: The mechanism used involves taking the predicted results (`y_test_predict`) from the
- classification model and comparing them against the actual values (`y_test_true`). A confusion matrix is built
- using the unique labels extracted from `y_test_true`, employing scikit-learn's metrics. The matrix is then visually
- rendered with the help of Plotly's `create_annotated_heatmap` function. A heatmap is created which provides a
- two-dimensional graphical representation of the model's performance, showcasing distributions of True Positives
- (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN).
-
- **Signs of High Risk**: Indicators of high risk related to the model include:
+ ### Purpose
+
+ The Confusion Matrix tester is designed to assess the performance of a classification Machine Learning model. This
+ performance is evaluated based on how well the model is able to correctly classify True Positives, True Negatives,
+ False Positives, and False Negatives - fundamental aspects of model accuracy.
+
+ ### Test Mechanism
+
+ The mechanism used involves taking the predicted results (`y_test_predict`) from the classification model and
+ comparing them against the actual values (`y_test_true`). A confusion matrix is built using the unique labels
+ extracted from `y_test_true`, employing scikit-learn's metrics. The matrix is then visually rendered with the help
+ of Plotly's `create_annotated_heatmap` function. A heatmap is created which provides a two-dimensional graphical
+ representation of the model's performance, showcasing distributions of True Positives (TP), True Negatives (TN),
+ False Positives (FP), and False Negatives (FN).
+
+ ### Signs of High Risk
+
  - High numbers of False Positives (FP) and False Negatives (FN), depicting that the model is not effectively
  classifying the values.
  - Low numbers of True Positives (TP) and True Negatives (TN), implying that the model is struggling with correctly
  identifying class labels.
 
- **Strengths**: The Confusion Matrix tester brings numerous strengths:
+ ### Strengths
+
  - It provides a simplified yet comprehensive visual snapshot of the classification model's predictive performance.
  - It distinctly brings out True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives
- (FN), thus, making it easier to focus on potential areas of improvement.
+ (FN), thus making it easier to focus on potential areas of improvement.
  - The matrix is beneficial in dealing with multi-class classification problems as it can provide a simple view of
  complex model performances.
  - It aids in understanding the different types of errors that the model could potentially make, as it provides
  in-depth insights into Type-I and Type-II errors.
 
- **Limitations**: Despite its various strengths, the Confusion Matrix tester does exhibit some limitations:
+ ### Limitations
+
  - In cases of unbalanced classes, the effectiveness of the confusion matrix might be lessened. It may wrongly
  interpret the accuracy of a model that is essentially just predicting the majority class.
  - It does not provide a single unified statistic that could evaluate the overall performance of the model.
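The mechanism described in the ConfusionMatrix docstring above (scikit-learn's confusion matrix rendered with Plotly's `create_annotated_heatmap`) looks roughly like the following sketch. It is illustrative only; the test's actual figure construction and styling are not shown in this hunk:

```python
import numpy as np
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix


def confusion_matrix_heatmap(y_true, y_pred):
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # Annotated heatmap: predicted labels on the x-axis, true labels on the y-axis
    return ff.create_annotated_heatmap(
        z=cm,
        x=[str(label) for label in labels],
        y=[str(label) for label in labels],
        colorscale="Blues",
    )
```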
@@ -0,0 +1,95 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import pandas as pd
+ from sklearn.inspection import permutation_importance
+
+ from validmind import tags, tasks
+
+
+ @tags("model_explainability", "sklearn")
+ @tasks("regression", "time_series_forecasting")
+ def FeatureImportance(dataset, model, num_features=3):
+ """
+ Compute feature importance scores for a given model and generate a summary table
+ with the top important features.
+
+ ### Purpose
+
+ The Feature Importance Comparison test is designed to compare the feature importance scores for different models
+ when applied to various datasets. By doing so, it aims to identify the most impactful features and assess the
+ consistency of feature importance across models.
+
+ ### Test Mechanism
+
+ This test works by iterating through each dataset-model pair and calculating permutation feature importance (PFI)
+ scores. It then generates a summary table containing the top `num_features` important features for each model. The
+ process involves:
+
+ - Extracting features and target data from each dataset.
+ - Computing PFI scores using `sklearn.inspection.permutation_importance`.
+ - Sorting and selecting the top features based on their importance scores.
+ - Compiling these features into a summary table for comparison.
+
+ ### Signs of High Risk
+
+ - Key features expected to be important are ranked low, indicating potential issues with model training or data
+ quality.
+ - High variance in feature importance scores across different models, suggesting instability in feature selection.
+
+ ### Strengths
+
+ - Provides a clear comparison of the most important features for each model.
+ - Uses permutation importance, which is a model-agnostic method and can be applied to any estimator.
+
+ ### Limitations
+
+ - Assumes that the dataset is provided as a DataFrameDataset object with `x_df` and `y_df` methods to access
+ feature and target data.
+ - Requires that `model.model` is compatible with `sklearn.inspection.permutation_importance`.
+ - The function's output is dependent on the number of features specified by `num_features`, which defaults to 3 but
+ can be adjusted.
+ """
+ results_list = []
+
+ x = dataset.x_df()
+ y = dataset.y_df()
+
+ pfi_values = permutation_importance(
+ model.model,
+ x,
+ y,
+ random_state=0,
+ n_jobs=-2,
+ )
+
+ # Create a dictionary to store PFI scores
+ pfi = {
+ column: pfi_values["importances_mean"][i] for i, column in enumerate(x.columns)
+ }
+
+ # Sort features by their importance
+ sorted_features = sorted(pfi.items(), key=lambda item: item[1], reverse=True)
+
+ # Extract the top `num_features` features
+ top_features = sorted_features[:num_features]
+
+ # Prepare the result for the current model and dataset
+ result = {}
+
+ # Dynamically add feature columns to the result
+ for i in range(num_features):
+ if i < len(top_features):
+ result[
+ f"Feature {i + 1}"
+ ] = f"[{top_features[i][0]}; {top_features[i][1]:.4f}]"
+ else:
+ result[f"Feature {i + 1}"] = None
+
+ # Append the result to the list
+ results_list.append(result)
+
+ # Convert the results list to a DataFrame
+ results_df = pd.DataFrame(results_list)
+ return results_df
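Since the new `FeatureImportance` above is a functional-style test registered via the `@tags`/`@tasks` decorators, it would typically be invoked through the library's test runner. A hedged usage sketch, assuming the standard `validmind.tests.run_test` entry point and previously initialized `vm_dataset`/`vm_model` objects (these names are placeholders, not part of the diff):

```python
import validmind as vm

# `vm_dataset` and `vm_model` are assumed to come from vm.init_dataset() /
# vm.init_model(); the test ID mirrors the new module's path in this diff.
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.FeatureImportance",
    inputs={"dataset": vm_dataset, "model": vm_model},
    params={"num_features": 5},
)
```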
@@ -15,27 +15,27 @@ class FowlkesMallowsScore(ClusterPerformance):
  Evaluates the similarity between predicted and actual cluster assignments in a model using the Fowlkes-Mallows
  score.
 
- **Purpose:**
+ ### Purpose
 
  The FowlkesMallowsScore is a performance metric used to validate clustering algorithms within machine learning
  models. The score intends to evaluate the matching grade between two clusters. It measures the similarity between
  the predicted and actual cluster assignments, thus gauging the accuracy of the model's clustering capability.
 
- **Test Mechanism:**
+ ### Test Mechanism
 
  The FowlkesMallowsScore method applies the `fowlkes_mallows_score` function from the `sklearn` library to evaluate
  the model's accuracy in clustering different types of data. The test fetches the datasets from the model's training
  and testing datasets as inputs then compares the resulting clusters against the previously known clusters to obtain
  a score. A high score indicates a better clustering performance by the model.
 
- **Signs of High Risk:**
+ ### Signs of High Risk
 
  - A low Fowlkes-Mallows score (near zero): This indicates that the model's clustering capability is poor and the
  algorithm isn't properly grouping data.
- - Inconsistently low scores across different datasets: this may indicate that the model's clustering performance is
+ - Inconsistently low scores across different datasets: This may indicate that the model's clustering performance is
  not robust and the model may fail when applied to unseen data.
 
- **Strengths:**
+ ### Strengths
 
  - The Fowlkes-Mallows score is a simple and effective method for evaluating the performance of clustering
  algorithms.
@@ -43,7 +43,7 @@ class FowlkesMallowsScore(ClusterPerformance):
  comprehensive measure of model performance.
  - The Fowlkes-Mallows score is non-biased meaning it treats False Positives and False Negatives equally.
 
- **Limitations:**
+ ### Limitations
 
  - As a pairwise-based method, this score can be computationally intensive for large datasets and can become
  unfeasible as the size of the dataset increases.
@@ -54,7 +54,7 @@ class FowlkesMallowsScore(ClusterPerformance):
  """
 
  name = "fowlkes_mallows_score"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",