validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.8.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/sklearn/AdjustedRandIndex.py
@@ -2,15 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
+from sklearn.metrics import adjusted_rand_score
 
-from sklearn import metrics
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
-from .ClusterPerformance import ClusterPerformance
 
-
-@dataclass
-class AdjustedRandIndex(ClusterPerformance):
+@tags("sklearn", "model_performance", "clustering")
+@tasks("clustering")
+def AdjustedRandIndex(model: VMModel, dataset: VMDataset):
     """
     Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine
     learning models.
@@ -49,14 +49,11 @@ class AdjustedRandIndex(ClusterPerformance):
     - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
     heavily dependent on the characteristics of the dataset used.
     """
-
-    name = "adjusted_rand_index"
-    required_inputs = ["model", "dataset"]
-    tasks = ["clustering"]
-    tags = [
-        "sklearn",
-        "model_performance",
+    return [
+        {
+            "Adjusted Rand Index": adjusted_rand_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            )
+        }
     ]
-
-    def metric_info(self):
-        return {"Adjusted Rand Index": metrics.adjusted_rand_score}

validmind/tests/model_validation/sklearn/ClassifierPerformance.py
@@ -2,24 +2,25 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
 from sklearn.metrics import classification_report, roc_auc_score
 from sklearn.preprocessing import LabelBinarizer
 
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
 
 def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
     lb = LabelBinarizer()
     lb.fit(y_test)
-
     return roc_auc_score(lb.transform(y_test), lb.transform(y_pred), average=average)
 
 
-@dataclass
-class ClassifierPerformance(Metric):
+@tags(
+    "sklearn", "binary_classification", "multiclass_classification", "model_performance"
+)
+@tasks("classification", "text_classification")
+def ClassifierPerformance(dataset: VMDataset, model: VMModel, average: str = "macro"):
     """
     Evaluates performance of binary or multiclass classification models using precision, recall, F1-Score, accuracy,
     and ROC AUC scores.
@@ -57,92 +58,53 @@ class ClassifierPerformance(Metric):
     - Specifically designed for classification models and not suitable for regression models.
     - May provide limited insights if the test dataset does not represent real-world scenarios adequately.
     """
-
-    name = "classifier_performance"
-    required_inputs = ["model", "dataset"]
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_performance",
+    y_pred = dataset.y_pred(model)
+    y_true = dataset.y
+
+    labels = np.unique(y_true)
+    labels = sorted(labels.tolist())
+
+    report = classification_report(
+        y_true=y_true,
+        y_pred=y_pred,
+        output_dict=True,
+        zero_division=0,
+    )
+
+    if len(labels) > 2:
+        y_true = y_true.astype(y_pred.dtype)
+        roc_auc = multiclass_roc_auc_score(y_true, y_pred, average=average)
+    else:
+        y_prob = dataset.y_prob(model)
+        y_true = y_true.astype(y_prob.dtype).flatten()
+        roc_auc = roc_auc_score(y_true, y_prob, average=average)
+
+    report["roc_auc"] = roc_auc
+
+    pr_f1_table = [
+        {
+            "Class": f"{class_name}",
+            "Precision": report[f"{class_name}"]["precision"],
+            "Recall": report[f"{class_name}"]["recall"],
+            "F1": report[f"{class_name}"]["f1-score"],
+        }
+        for class_name in labels
     ]
-    default_params = {"average": "macro"}
-
-    def summary(self, metric_value: dict):
-        """
-        When building a multi-class summary we need to calculate weighted average,
-        macro average and per class metrics.
-        """
-        classes = {str(i) for i in np.unique(self.inputs.dataset.y)}
-        pr_f1_table = [
-            {
-                "Class": class_name,
-                "Precision": metric_value[class_name]["precision"],
-                "Recall": metric_value[class_name]["recall"],
-                "F1": metric_value[class_name]["f1-score"],
-            }
-            for class_name in classes
-        ]
-        pr_f1_table.extend(
-            [
-                {
-                    "Class": "Weighted Average",
-                    "Precision": metric_value["weighted avg"]["precision"],
-                    "Recall": metric_value["weighted avg"]["recall"],
-                    "F1": metric_value["weighted avg"]["f1-score"],
-                },
-                {
-                    "Class": "Macro Average",
-                    "Precision": metric_value["macro avg"]["precision"],
-                    "Recall": metric_value["macro avg"]["recall"],
-                    "F1": metric_value["macro avg"]["f1-score"],
-                },
-            ]
-        )
 
-        acc_roc_auc_table = [
+    for avg in ["weighted avg", "macro avg"]:
+        pr_f1_table.append(
             {
-                "Metric": "Accuracy" if metric_name == "accuracy" else "ROC AUC",
-                "Value": metric_value[metric_name],
+                "Class": avg.replace("avg", "Average").title(),
+                "Precision": report[avg]["precision"],
+                "Recall": report[avg]["recall"],
+                "F1": report[avg]["f1-score"],
            }
-            for metric_name in ["accuracy", "roc_auc"]
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pr_f1_table,
-                    metadata=ResultTableMetadata(title="Precision, Recall, and F1"),
-                ),
-                ResultTable(
-                    data=acc_roc_auc_table,
-                    metadata=ResultTableMetadata(title="Accuracy and ROC AUC"),
-                ),
-            ]
        )
 
-    def run(self):
-        report = classification_report(
-            self.inputs.dataset.y,
-            self.inputs.dataset.y_pred(self.inputs.model),
-            output_dict=True,
-            zero_division=0,
-        )
-
-        y_true = self.inputs.dataset.y
-
-        if len(np.unique(y_true)) > 2:
-            y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-            y_true = y_true.astype(y_pred.dtype)
-            roc_auc = multiclass_roc_auc_score(
-                y_true, y_pred, average=self.params["average"]
-            )
-        else:
-            y_prob = self.inputs.dataset.y_prob(self.inputs.model)
-            y_true = y_true.astype(y_prob.dtype).flatten()
-            roc_auc = roc_auc_score(y_true, y_prob, average=self.params["average"])
-
-        report["roc_auc"] = roc_auc
-
-        return self.cache_results(report)
+    return {
+        "Precision, Recall, and F1": pr_f1_table,
+        "Accuracy and ROC AUC": [
+            {"Metric": m, "Value": report[k]}
+            for m, k in [("Accuracy", "accuracy"), ("ROC AUC", "roc_auc")]
+        ],
+    }

validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py
@@ -2,17 +2,17 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
-import pandas as pd
 from sklearn.metrics.pairwise import cosine_similarity
 
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
+from validmind.vm_models import VMDataset, VMModel
 
 
-@dataclass
-class ClusterCosineSimilarity(Metric):
+@tags("sklearn", "model_performance", "clustering")
+@tasks("clustering")
+def ClusterCosineSimilarity(model: VMModel, dataset: VMDataset):
     """
     Measures the intra-cluster similarity of a clustering model using cosine similarity.
 
@@ -56,59 +56,29 @@ class ClusterCosineSimilarity(Metric):
     - Lastly, although rare, perfect perpendicular vectors (cosine similarity = 0) could be within the same cluster,
     which may give an inaccurate representation of a 'bad' cluster due to low cosine similarity score.
     """
-
-    name = "cluster_cosine_similarity"
-    required_inputs = ["model", "dataset"]
-    tasks = ["clustering"]
-    tags = [
-        "sklearn",
-        "model_performance",
-    ]
-
-    def run(self):
-        y_true_train = self.inputs.dataset.y
-        y_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
-        y_true_train = y_true_train.astype(y_pred_train.dtype).flatten()
-        num_clusters = len(np.unique(y_pred_train))
-        # Calculate cosine similarity for each cluster
-        results = []
-        for cluster_id in range(num_clusters):
-            cluster_mask = y_pred_train == cluster_id
-            cluster_data = self.inputs.dataset.x[cluster_mask]
-            if cluster_data.size != 0:
-                # Compute the centroid of the cluster
-                cluster_centroid = np.mean(cluster_data, axis=0)
-                # Compute cosine similarities between the centroid and data points in the cluster
-                cosine_similarities = cosine_similarity(
-                    cluster_data, [cluster_centroid]
-                )
-                # Extract cosine similarity values for each data point in the cluster
-                cosine_similarities = cosine_similarities.flatten()
-                results.append(
-                    {
-                        "Cluster": cluster_id,
-                        "Mean Cosine Similarity": np.mean(cosine_similarities),
-                    }
-                )
-        return self.cache_results(
-            {
-                "cosine_similarity": pd.DataFrame(results).to_dict(orient="records"),
-            }
-        )
-
-    def summary(self, metric_value):
-        """
-        Build one table for summarizing the cluster cosine similarity results
-        """
-        summary_regression = metric_value["cosine_similarity"]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=summary_regression,
-                    metadata=ResultTableMetadata(
-                        title="Cluster Cosine Similarity Results"
+    y_pred = dataset.y_pred(model)
+    num_clusters = len(np.unique(y_pred))
+
+    table = []
+
+    for cluster_idx in range(num_clusters):
+        cluster_data = dataset.x[y_pred == cluster_idx]
+
+        if cluster_data.size != 0:
+            cluster_centroid = np.mean(cluster_data, axis=0)
+            table.append(
+                {
+                    "Cluster": cluster_idx,
+                    "Mean Cosine Similarity": np.mean(
+                        cosine_similarity(
+                            X=cluster_data,
+                            Y=[cluster_centroid],
+                        ).flatten()
                    ),
-                ),
-            ]
-        )
+                }
+            )
+
+    if not table:
+        raise SkipTestError("No clusters found")
+
+    return table

validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py
@@ -2,17 +2,74 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
-from sklearn import metrics
-
-from validmind.vm_models import ResultSummary, ResultTable
-
-from .ClusterPerformance import ClusterPerformance
-
-
-@dataclass
-class ClusterPerformanceMetrics(ClusterPerformance):
+from sklearn.metrics import (
+    adjusted_mutual_info_score,
+    adjusted_rand_score,
+    completeness_score,
+    fowlkes_mallows_score,
+    homogeneity_score,
+    v_measure_score,
+)
+
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+HOMOGENEITY = """
+The homogeneity score is a clustering evaluation metric that quantifies the degree to which each cluster within a
+clustering solution contains only data points that belong to a single true class or category. It provides a score
+within the range of 0 to 1, where a higher homogeneity score indicates that the clusters are more pure and internally
+consistent with respect to the ground truth labels, meaning that the data points within each cluster are closely related
+in terms of their actual class membership.
+"""
+
+COMPLETENESS = """
+The completeness score is a clustering evaluation metric used to assess how well a clustering solution captures all data points
+that belong to a single true class or category. It quantifies the extent to which the data points of a given class are
+grouped into a single cluster. The completeness score ranges from 0 to 1, with a higher score indicating that the clustering
+solution effectively accounts for all data points within their actual class, emphasizing the comprehensiveness of the
+clustering results with respect to the ground truth labels.
+"""
+
+V_MEASURE = """
+The V-Measure score is a clustering evaluation metric that combines both homogeneity and completeness to provide a
+single measure of the overall quality of a clustering solution. It takes into account how well clusters are internally
+coherent (homogeneity) and how well they capture all data points from the true classes (completeness). The V-Measure
+score ranges from 0 to 1, where a higher score indicates a better clustering result. It balances the trade-off between
+cluster purity and the extent to which all data points from true classes are captured, offering a comprehensive evaluation
+of the clustering performance.
+"""
+ADJUSTED_RAND_INDEX = """
+The Adjusted Rand Index (ARI) is a clustering evaluation metric used to measure the
+similarity between the cluster assignments in a clustering solution and the true class labels. It calculates a
+score that ranges from -1 to 1, with a higher score indicating a better clustering result. A score of 1 signifies
+perfect agreement between the clustering and the ground truth, while a score near 0 implies that the clustering
+is random with respect to the true labels, and negative values indicate disagreement. ARI accounts for chance
+clustering, making it a robust measure for assessing the quality of clustering solutions by considering both the
+extent of agreement and potential randomness in the assignments.
+"""
+
+ADJUSTED_MUTUAL_INFORMATION = """
+The Adjusted Mutual Information (AMI) is a clustering evaluation metric used to quantify the degree of
+agreement between a clustering solution and the true class labels. It provides a score that ranges from 0 to 1,
+with a higher score indicating a better clustering result. A score of 1 signifies perfect agreement,
+while a score of 0 suggests that the clustering is random with respect to the true labels. AMI takes into account the
+potential randomness in the assignments and adjusts for chance, making it a robust measure that considers both the
+extent of agreement and the potential for random clustering.
+"""
+
+FOULKES_MALLOWS_SCORE = """
+The Fowlkes-Mallows score is a clustering evaluation metric used to assess the quality of
+a clustering solution by measuring the geometric mean of two fundamental clustering metrics: precision and recall. It
+provides a score that ranges from 0 to 1, where a higher score indicates a better clustering result. A score of 1 signifies
+perfect agreement with the true class labels, while lower scores suggest less precise and recall clustering performance.
+The Fowlkes-Mallows score offers a balanced evaluation of clustering quality by considering both the ability to correctly
+identify members of the same class (precision) and the ability to capture all members of the same class (recall).
+"""
+
+
+@tags("sklearn", "model_performance", "clustering")
+@tasks("clustering")
+def ClusterPerformanceMetrics(model: VMModel, dataset: VMDataset):
     """
     Evaluates the performance of clustering machine learning models using multiple established metrics.
 
@@ -58,75 +115,53 @@ class ClusterPerformanceMetrics(ClusterPerformance):
     - Does not consider aspects like computational efficiency of the model or its capability to handle high dimensional
     data.
     """
-
-    name = "homogeneity_score"
-    required_inputs = ["model", "dataset"]
-    tasks = ["clustering"]
-    tags = ["sklearn", "model_performance"]
-    default_metrics = {
-        "Homogeneity Score": metrics.homogeneity_score,
-        "Completeness Score": metrics.completeness_score,
-        "V Measure": metrics.v_measure_score,
-        "Adjusted Rand Index": metrics.adjusted_rand_score,
-        "Adjusted Mutual Information": metrics.adjusted_mutual_info_score,
-        "Fowlkes-Mallows score": metrics.fowlkes_mallows_score,
-    }
-    default_metrics_desc = {
-        "Homogeneity Score": """The homogeneity score is a clustering evaluation metric that quantifies
-        the degree to which each cluster within a clustering solution contains only data points that belong
-        to a single true class or category. It provides a score within the range of 0 to 1, where a higher
-        homogeneity score indicates that the clusters are more pure and internally consistent with respect
-        to the ground truth labels, meaning that the data points within each cluster are closely related in
-        terms of their actual class membership.
-        """,
-        "Completeness Score": """The completeness score is a clustering evaluation metric used to assess how
-        well a clustering solution captures all data points that belong to a single true class or category.
-        It quantifies the extent to which the data points of a given class are grouped into a single cluster.
-        The completeness score ranges from 0 to 1, with a higher score indicating that the clustering solution
-        effectively accounts for all data points within their actual class, emphasizing the comprehensiveness of
-        the clustering results with respect to the ground truth labels.""",
-        "V Measure": """The V-Measure score is a clustering evaluation metric that combines both homogeneity and
-        completeness to provide a single measure of the overall quality of a clustering solution. It takes into
-        account how well clusters are internally coherent (homogeneity) and how well they capture all data points
-        from the true classes (completeness). The V-Measure score ranges from 0 to 1, where a higher score indicates
-        a better clustering result. It balances the trade-off between cluster purity and the extent to which all data
-        points from true classes are captured, offering a comprehensive evaluation of the clustering performance.""",
-        "Adjusted Rand Index": """The Adjusted Rand Index (ARI) is a clustering evaluation metric used to measure the
-        similarity between the cluster assignments in a clustering solution and the true class labels. It calculates a
-        score that ranges from -1 to 1, with a higher score indicating a better clustering result. A score of 1 signifies
-        perfect agreement between the clustering and the ground truth, while a score near 0 implies that the clustering
-        is random with respect to the true labels, and negative values indicate disagreement. ARI accounts for chance
-        clustering, making it a robust measure for assessing the quality of clustering solutions by considering both the
-        extent of agreement and potential randomness in the assignments.""",
-        "Adjusted Mutual Information": """The Adjusted Mutual Information (AMI) is a clustering evaluation metric used to
-        quantify the degree of agreement between a clustering solution and the true class labels. It provides a score that
-        ranges from 0 to 1, with a higher score indicating a better clustering result. A score of 1 signifies perfect agreement,
-        while a score of 0 suggests that the clustering is random with respect to the true labels. AMI takes into account the
-        potential randomness in the assignments and adjusts for chance, making it a robust measure that considers both the
-        extent of agreement and the potential for random clustering.""",
-        "Fowlkes-Mallows score": """The Fowlkes-Mallows score is a clustering evaluation metric used to assess the quality of
-        a clustering solution by measuring the geometric mean of two fundamental clustering metrics: precision and recall. It
-        provides a score that ranges from 0 to 1, where a higher score indicates a better clustering result. A score of 1 signifies
-        perfect agreement with the true class labels, while lower scores suggest less precise and recall clustering performance.
-        The Fowlkes-Mallows score offers a balanced evaluation of clustering quality by considering both the ability to correctly
-        identify members of the same class (precision) and the ability to capture all members of the same class (recall).""",
-    }
-
-    def summary(self, raw_results):
-        """
-        Returns a summarized representation of the dataset split information
-        """
-        table_records = []
-        for result in raw_results:
-            for key, _ in result.items():
-                table_records.append(
-                    {
-                        "Description": self.default_metrics_desc[key],
-                        key: result[key],
-                    }
-                )
-
-        return ResultSummary(results=[ResultTable(data=table_records)])
-
-    def metric_info(self):
-        return self.default_metrics
+    return [
+        {
+            "Metric": "Homogeneity Score",
+            "Description": HOMOGENEITY,
+            "Value": homogeneity_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            ),
+        },
+        {
+            "Metric": "Completeness Score",
+            "Description": COMPLETENESS,
+            "Value": completeness_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            ),
+        },
+        {
+            "Metric": "V Measure",
+            "Description": V_MEASURE,
+            "Value": v_measure_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            ),
+        },
+        {
+            "Metric": "Adjusted Rand Index",
+            "Description": ADJUSTED_RAND_INDEX,
+            "Value": adjusted_rand_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            ),
+        },
+        {
+            "Metric": "Adjusted Mutual Information",
+            "Description": ADJUSTED_MUTUAL_INFORMATION,
+            "Value": adjusted_mutual_info_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            ),
+        },
+        {
+            "Metric": "Fowlkes-Mallows score",
+            "Description": FOULKES_MALLOWS_SCORE,
+            "Value": fowlkes_mallows_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            ),
+        },
+    ]

validmind/tests/model_validation/sklearn/CompletenessScore.py
@@ -2,15 +2,15 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
+from sklearn.metrics import completeness_score
 
-from sklearn import metrics
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
-from .ClusterPerformance import ClusterPerformance
 
-
-@dataclass
-class CompletenessScore(ClusterPerformance):
+@tags("sklearn", "model_performance", "clustering")
+@tasks("clustering")
+def CompletenessScore(model: VMModel, dataset: VMDataset):
     """
     Evaluates a clustering model's capacity to categorize instances from a single class into the same cluster.
 
@@ -47,14 +47,11 @@ class CompletenessScore(ClusterPerformance):
    - The Completeness Score only applies to clustering models; it cannot be used for other types of machine learning
    models.
    """
-
-    name = "homogeneity_score"
-    required_inputs = ["model", "dataset"]
-    tasks = ["clustering"]
-    tags = [
-        "sklearn",
-        "model_performance",
+    return [
+        {
+            "Completeness Score": completeness_score(
+                labels_true=dataset.y,
+                labels_pred=dataset.y_pred(model),
+            )
+        }
     ]
-
-    def metric_info(self):
-        return {"Completeness Score": metrics.completeness_score}
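
Across the five files shown above, the refactor follows the same pattern: the 2.5.x Metric subclasses with run() and summary() methods are replaced in 2.6.x by plain functions registered via the @tags and @tasks decorators, which take VMDataset/VMModel inputs and return their result tables directly. As a rough sketch only (not code from the package), a custom test written against this new pattern might look like the following; the decorators, types, and the dataset.y / dataset.y_pred(model) accessors are taken from the hunks above, while the metric choice (balanced accuracy) and the function name are arbitrary illustrations.

# Hypothetical custom test following the 2.6.x functional pattern shown in the diff.
from sklearn.metrics import balanced_accuracy_score

from validmind import tags, tasks
from validmind.vm_models import VMDataset, VMModel


@tags("sklearn", "model_performance")
@tasks("classification")
def BalancedAccuracy(model: VMModel, dataset: VMDataset):
    """Returns a one-row table with the model's balanced accuracy on the dataset."""
    return [
        {
            # dataset.y / dataset.y_pred(model) mirror the accessors used in the hunks above
            "Balanced Accuracy": balanced_accuracy_score(
                y_true=dataset.y,
                y_pred=dataset.y_pred(model),
            )
        }
    ]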