validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/sklearn/RegressionR2Square.py
@@ -2,105 +2,66 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
-
+ import pandas as pd
  from sklearn import metrics

+ from validmind import tags, tasks
  from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
- from validmind.vm_models import Metric, ResultSummary, ResultTable


- @dataclass
- class RegressionR2Square(Metric):
+ @tags("sklearn", "model_performance")
+ @tasks("regression")
+ def RegressionR2Square(dataset, model):
  """
- **Purpose**: The purpose of the RegressionR2Square Metric test is to measure the overall goodness-of-fit of a
- regression model. Specifically, this Python-based test evaluates the R-squared (R2) and Adjusted R-squared (Adj R2)
- scores: two statistical measures within regression analysis used to evaluate the strength of the relationship
- between the model's predictors and the response variable.
-
- **Test Mechanism**: The test deploys the 'r2_score' method from the Scikit-learn metrics module, measuring the R2
- score on both training and test sets. This score reflects the proportion of the variance in the dependent variable
- that is predictable from independent variables. The test also considers the Adjusted R2 score, accounting for the
- number of predictors in the model, to penalize model complexity and thus reduce overfitting. The Adjusted R2 score
- will be smaller if unnecessary predictors are included in the model.
-
- **Signs of High Risk**: Indicators of high risk in this test may include a low R2 or Adjusted R2 score, which would
- suggest that the model does not explain much variation in the dependent variable. The occurrence of overfitting is
- also a high-risk sign, evident when the R2 score on the training set is significantly higher than on the test set,
- indicating that the model is not generalizing well to unseen data.
-
- **Strengths**: The R2 score is a widely-used measure in regression analysis, providing a sound general indication
- of model performance. It is easy to interpret and understand, as it is essentially representing the proportion of
- the dependent variable's variance explained by the independent variables. The Adjusted R2 score complements the R2
- score well by taking into account the number of predictors in the model, which helps control overfitting.
-
- **Limitations**: R2 and Adjusted R2 scores can be sensitive to the inclusion of unnecessary predictors in the model
- (even though Adjusted R2 is intended to penalize complexity). Their reliability might also lessen in cases of
- non-linear relationships or when the underlying assumptions of linear regression are violated. Additionally, while
- they summarize how well the model fits the data, they do not provide insight on whether the correct regression was
- used, or whether certain key assumptions have been fulfilled.
+ Assesses the overall goodness-of-fit of a regression model by evaluating R-squared (R2) and Adjusted R-squared (Adj
+ R2) scores to determine the model's explanatory power over the dependent variable.
+
+ ### Purpose
+
+ The purpose of the RegressionR2Square Metric test is to measure the overall goodness-of-fit of a regression model.
+ Specifically, this Python-based test evaluates the R-squared (R2) and Adjusted R-squared (Adj R2) scores, which are
+ statistical measures used to assess the strength of the relationship between the model's predictors and the
+ response variable.
+
+ ### Test Mechanism
+
+ The test deploys the `r2_score` method from the Scikit-learn metrics module to measure the R2 score on both
+ training and test sets. This score reflects the proportion of the variance in the dependent variable that is
+ predictable from the independent variables. The test also calculates the Adjusted R2 score, which accounts for the
+ number of predictors in the model to penalize model complexity and reduce overfitting. The Adjusted R2 score will
+ be smaller if unnecessary predictors are included in the model.
+
+ ### Signs of High Risk
+
+ - Low R2 or Adjusted R2 scores, suggesting that the model does not explain much variation in the dependent variable.
+ - Significant discrepancy between R2 scores on the training set and test set, indicating overfitting and poor
+ generalization to unseen data.
+
+ ### Strengths
+
+ - Widely-used measure in regression analysis, providing a sound general indication of model performance.
+ - Easy to interpret and understand, as it represents the proportion of the dependent variable's variance explained
+ by the independent variables.
+ - Adjusted R2 score helps control overfitting by penalizing unnecessary predictors.
+
+ ### Limitations
+
+ - Sensitive to the inclusion of unnecessary predictors even though Adjusted R2 penalizes complexity.
+ - Less reliable in cases of non-linear relationships or when the underlying assumptions of linear regression are
+ violated.
+ - Does not provide insight on whether the correct regression model was used or if key assumptions have been met.
  """

- name = "regression_errors_r2_square"
- required_inputs = ["model", "datasets"]
- tasks = ["regression"]
- tags = [
- "sklearn",
- "model_performance",
- ]
-
- def summary(self, raw_results):
- """
- Returns a summarized representation of the dataset split information
- """
- table_records = []
- for result in raw_results:
- for key, _ in result.items():
- table_records.append(
- {
- "Metric": key,
- "TRAIN": result[key]["train"],
- "TEST": result[key]["test"],
- }
- )
-
- return ResultSummary(results=[ResultTable(data=table_records)])
-
- def run(self):
- y_train_true = self.inputs.datasets[0].y
- y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
- y_train_true = y_train_true.astype(y_train_pred.dtype)
-
- y_test_true = self.inputs.datasets[1].y
- y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
- y_test_true = y_test_true.astype(y_test_pred.dtype)
-
- r2s_train = metrics.r2_score(y_train_true, y_train_pred)
- r2s_test = metrics.r2_score(y_test_true, y_test_pred)
-
- results = []
- results.append(
- {
- "R-squared (R2) Score": {
- "train": r2s_train,
- "test": r2s_test,
- }
- }
- )
-
- X_columns = self.inputs.datasets[0].feature_columns
- adj_r2_train = adj_r2_score(
- y_train_true, y_train_pred, len(y_train_true), len(X_columns)
- )
- adj_r2_test = adj_r2_score(
- y_test_true, y_test_pred, len(y_test_true), len(X_columns)
- )
- results.append(
- {
- "Adjusted R-squared (R2) Score": {
- "train": adj_r2_train,
- "test": adj_r2_test,
- }
- }
- )
- return self.cache_results(metric_value=results)
+ y_true = dataset.y
+ y_pred = dataset.y_pred(model)
+ y_true = y_true.astype(y_pred.dtype)
+
+ r2s = metrics.r2_score(y_true, y_pred)
+ adj_r2 = adj_r2_score(y_true, y_pred, len(y_true), len(dataset.feature_columns))
+
+ # Create dataframe with R2 and Adjusted R2 in one row
+ results_df = pd.DataFrame(
+ {"R-squared (R2) Score": [r2s], "Adjusted R-squared (R2) Score": [adj_r2]}
+ )
+
+ return results_df
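
For orientation, the refactored functional test above scores a single dataset/model pair and returns the two scores as one pandas row. A minimal standalone sketch of the same arithmetic, using scikit-learn and the textbook adjusted-R2 formula (the names below are illustrative stand-ins, not the package's `adj_r2_score` helper):

    import numpy as np
    import pandas as pd
    from sklearn.metrics import r2_score

    def adjusted_r2(y_true, y_pred, n_obs, n_features):
        # Adjusted R2 penalizes plain R2 for the number of predictors used.
        r2 = r2_score(y_true, y_pred)
        return 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)

    y_true = np.array([3.1, 2.4, 5.0, 4.2])
    y_pred = np.array([2.9, 2.6, 4.8, 4.4])
    row = pd.DataFrame(
        {
            "R-squared (R2) Score": [r2_score(y_true, y_pred)],
            "Adjusted R-squared (R2) Score": [adjusted_r2(y_true, y_pred, len(y_true), 2)],
        }
    )
    print(row)
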

validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py
@@ -13,26 +13,45 @@ from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
  @tasks("regression", "time_series_forecasting")
  def RegressionR2SquareComparison(datasets, models):
  """
- Compare R-Squared and Adjusted R-Squared values for each model and generate a summary table
- with the results.
+ Compares R-Squared and Adjusted R-Squared values for different regression models across multiple datasets to assess
+ model performance and relevance of features.

- **Purpose**: The purpose of this function is to compare the R-Squared and Adjusted R-Squared values for different models applied to various datasets.
+ ### Purpose

- **Test Mechanism**: The function iterates through each dataset-model pair, calculates the R-Squared and Adjusted R-Squared values, and generates a summary table with these results.
+ The Regression R2 Square Comparison test aims to compare the R-Squared and Adjusted R-Squared values for different
+ regression models across various datasets. It helps in assessing how well each model explains the variability in
+ the dataset, and whether the models include irrelevant features.

- **Signs of High Risk**:
- - If the R-Squared values are significantly low, it could indicate that the model is not explaining much of the variability in the dataset.
- - A significant difference between R-Squared and Adjusted R-Squared values might indicate that the model includes irrelevant features.
+ ### Test Mechanism
+
+ This test operates by:
+
+ - Iterating through each dataset-model pair.
+ - Calculating the R-Squared values to measure how much of the variability in the dataset is explained by the model.
+ - Calculating the Adjusted R-Squared values, which adjust the R-Squared based on the number of predictors in the
+ model, making it more reliable when comparing models with different numbers of features.
+ - Generating a summary table containing these values for each combination of dataset and model.
+
+ ### Signs of High Risk
+
+ - If the R-Squared values are significantly low, it indicates the model isn't explaining much of the variability in
+ the dataset.
+ - A significant difference between R-Squared and Adjusted R-Squared values might indicate that the model includes
+ irrelevant features.
+
+ ### Strengths

- **Strengths**:
  - Provides a quantitative measure of model performance in terms of variance explained.
- Adjusted R-Squared accounts for the number of predictors, making it a more reliable measure when comparing models with different numbers of features.
+ - Adjusted R-Squared accounts for the number of predictors, making it a more reliable measure when comparing models
+ with different numbers of features.
+ - Useful for time-series forecasting and regression tasks.

- **Limitations**:
- - Assumes that the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
- - The function relies on `adj_r2_score` from the `statsmodels.statsutils` module, which should be correctly implemented and imported.
- - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
+ ### Limitations

+ - Assumes the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
+ - Relies on `adj_r2_score` from the `statsmodels.statsutils` module, which needs to be correctly implemented and
+ imported.
+ - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
  """
  results_list = []
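
The comparison variant collects one summary row per dataset/model pair rather than returning a single score. A rough sketch of that loop shape, with made-up pair data standing in for the VMDataset and VMModel inputs (column names here are illustrative):

    import pandas as pd
    from sklearn.metrics import r2_score

    def adjusted_r2(r2, n_obs, n_features):
        return 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)

    # Each tuple stands in for one (dataset, model) combination the test iterates over.
    pairs = [
        ("train_ds", "model_a", [3.0, 4.0, 5.0, 6.0], [2.8, 4.1, 5.2, 5.9], 2),
        ("test_ds", "model_a", [3.5, 4.5, 5.5, 6.5], [3.2, 4.8, 5.1, 6.9], 2),
    ]
    rows = []
    for ds_name, model_name, y_true, y_pred, n_features in pairs:
        r2 = r2_score(y_true, y_pred)
        rows.append(
            {
                "Model": model_name,
                "Dataset": ds_name,
                "R-Squared": r2,
                "Adjusted R-Squared": adjusted_r2(r2, len(y_true), n_features),
            }
        )
    print(pd.DataFrame(rows))
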

validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
@@ -315,38 +315,42 @@ def robustness_diagnosis(

  @dataclass
  class RobustnessDiagnosis(ThresholdTest):
- """Evaluate the robustness of a machine learning model to noise
-
- Robustness refers to a model's ability to maintain a high level of performance in
- the face of perturbations or changes (particularly noise) added to its input data.
- This test is designed to help gauge how well the model can handle potential real-
- world scenarios where the input data might be incomplete or corrupted.
-
- ## Test Methodology
- This test is conducted by adding Gaussian noise, proportional to a particular standard
- deviation scale, to numeric input features of the input datasets. The model's
- performance on the perturbed data is then evaluated using a user-defined metric or the
- default metric of AUC for classification tasks and MSE for regression tasks. The results
- are then plotted to visualize the model's performance decay as the perturbation size
- increases.
-
- When using this test, it is highly recommended to tailor the performance metric, list
- of scaling factors for the standard deviation of the noise, and the performance decay
- threshold to the specific use case of the model being evaluated.
-
- **Inputs**:
- - model (VMModel): The trained model to be evaluated.
- - datasets (List[VMDataset]): A list of datasets to evaluate the model against.
-
- ## Parameters
- - metric (str, optional): The performance metric to be used for evaluation. If not
- provided, the default metric is used based on the task of the model. Default values
- are "auc" for classification tasks and "mse" for regression tasks.
- - scaling_factor_std_dev_list (List[float], optional): A list of scaling factors for
- the standard deviation of the noise to be added to the input features. The default
- values are [0.1, 0.2, 0.3, 0.4, 0.5].
- - performance_decay_threshold (float, optional): The threshold for the performance
- decay of the model. The default value is 0.05.
+ """
+ Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
+
+ ### Purpose
+
+ The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
+ perturbations or noise in its input data. This is essential for understanding the model's ability to handle
+ real-world scenarios where data may be imperfect or corrupted.
+
+ ### Test Mechanism
+
+ This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
+ deviation. The performance of the model is then measured using a specified metric. The process includes:
+
+ - Adding Gaussian noise to numerical input features based on scaling factors.
+ - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
+ for regression tasks.
+ - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
+
+ ### Signs of High Risk
+
+ - A significant drop in performance metrics with minimal noise.
+ - Performance decay values exceeding the specified threshold.
+ - Consistent failure to meet performance standards across multiple perturbation scales.
+
+ ### Strengths
+
+ - Provides insights into the model's robustness against noisy or corrupted data.
+ - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
+ - Visualization helps in understanding the extent of performance degradation.
+
+ ### Limitations
+
+ - Gaussian noise might not adequately represent all types of real-world data perturbations.
+ - Performance thresholds are somewhat arbitrary and might need tuning.
+ - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
  """

  name = "robustness"
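
The noise-injection mechanism described in the docstring can be approximated outside the framework in a few lines; the scaling factors and the 0.05 decay threshold below mirror the defaults quoted above, while the model and data are placeholders:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    X, y = make_classification(n_samples=500, n_features=5, random_state=0)
    model = LogisticRegression(max_iter=1000).fit(X, y)
    baseline = roc_auc_score(y, model.predict_proba(X)[:, 1])

    rng = np.random.default_rng(0)
    for scale in [0.1, 0.2, 0.3, 0.4, 0.5]:  # default scaling_factor_std_dev_list
        # Gaussian noise proportional to each feature's standard deviation.
        noisy = X + rng.normal(0.0, scale * X.std(axis=0), size=X.shape)
        decay = baseline - roc_auc_score(y, model.predict_proba(noisy)[:, 1])
        status = "Fail" if decay > 0.05 else "Pass"  # default performance_decay_threshold
        print(f"scale={scale:.1f}  AUC decay={decay:.3f}  {status}")
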

validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py
@@ -22,13 +22,15 @@ class SHAPGlobalImportance(Metric):
  """
  Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.

- **Purpose:**
+ ### Purpose
+
  The SHAP (SHapley Additive exPlanations) Global Importance metric aims to elucidate model outcomes by attributing
  them to the contributing features. It assigns a quantifiable global importance to each feature via their respective
  absolute Shapley values, thereby making it suitable for tasks like classification (both binary and multiclass).
  This metric forms an essential part of model risk management.

- **Test Mechanism:**
+ ### Test Mechanism
+
  The exam begins with the selection of a suitable explainer which aligns with the model's type. For tree-based
  models like XGBClassifier, RandomForestClassifier, CatBoostClassifier, TreeExplainer is used whereas for linear
  models like LogisticRegression, XGBRegressor, LinearRegression, it is the LinearExplainer. Once the explainer

@@ -44,20 +46,20 @@ class SHAPGlobalImportance(Metric):
  gradually changing from low to high. Features are systematically organized in accordance with their importance.
  These plots are generated by the function `_generate_shap_plot()`.

- **Signs of High Risk:**
+ ### Signs of High Risk

  - Overemphasis on certain features in SHAP importance plots, thus hinting at the possibility of model overfitting
  - Anomalies such as unexpected or illogical features showing high importance, which might suggest that the model's
  decisions are rooted in incorrect or undesirable reasoning
  - A SHAP summary plot filled with high variability or scattered data points, indicating a cause for concern

- **Strengths:**
+ ### Strengths

  - SHAP does more than just illustrating global feature significance, it offers a detailed perspective on how
  different features shape the model's decision-making logic for each instance.
  - It provides clear insights into model behavior.

- **Limitations:**
+ ### Limitations

  - High-dimensional data can convolute interpretations.
  - Associating importance with tangible real-world impact still involves a certain degree of subjectivity.
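
The explainer-selection step summarized above (TreeExplainer for tree ensembles, LinearExplainer for linear models) can be sketched roughly as follows; the dispatch below is an illustration under those assumptions, not the package's implementation:

    import shap
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=4, random_state=0)

    def build_explainer(model, background):
        # Tree ensembles -> TreeExplainer; linear models -> LinearExplainer;
        # anything else could fall back to the model-agnostic KernelExplainer.
        if isinstance(model, RandomForestClassifier):
            return shap.TreeExplainer(model)
        if isinstance(model, LogisticRegression):
            return shap.LinearExplainer(model, background)
        return shap.KernelExplainer(model.predict_proba, background)

    model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
    explainer = build_explainer(model, X)
    # Per-class SHAP arrays for classifiers (exact format depends on the SHAP version).
    shap_values = explainer.shap_values(X)
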

@@ -76,6 +78,7 @@ class SHAPGlobalImportance(Metric):
  default_params = {
  "kernel_explainer_samples": 10,
  "tree_or_linear_explainer_samples": 200,
+ "class_of_interest": None,
  }

  def _generate_shap_plot(self, type_, shap_values, x_test):

@@ -105,6 +108,7 @@ class SHAPGlobalImportance(Metric):
  shap_values / max_shap_value * 100
  ) # scaling factor to make the top feature 100%
  summary_plot_extra_args = {"plot_type": "bar"}
+
  shap.summary_plot(
  shap_values, x_test, show=False, **summary_plot_extra_args
  )

@@ -190,6 +194,10 @@ class SHAPGlobalImportance(Metric):

  shap_values = explainer.shap_values(shap_sample)

+ # Select the SHAP values for the specified class (classification) or for the regression output.
+ class_of_interest = self.params["class_of_interest"]
+ shap_values = _select_shap_values(shap_values, class_of_interest)
+
  figures = [
  self._generate_shap_plot("mean", shap_values, shap_sample),
  self._generate_shap_plot("summary", shap_values, shap_sample),

@@ -212,3 +220,56 @@ class SHAPGlobalImportance(Metric):
  for fig_num, type_ in enumerate(["mean", "summary"], start=1):
  assert isinstance(self.result.figures[fig_num - 1], Figure)
  assert self.result.figures[fig_num - 1].metadata["type"] == type_
+
+
+ def _select_shap_values(shap_values, class_of_interest=None):
+ """
+ Selects SHAP values for binary or multiclass classification. For regression models,
+ returns the SHAP values directly as there are no classes.
+
+ Parameters:
+ -----------
+ shap_values : list or numpy.ndarray
+ The SHAP values returned by the SHAP explainer. For multiclass classification,
+ this will be a list where each element corresponds to a class. For regression,
+ this will be a single array of SHAP values.
+
+ class_of_interest : int, optional
+ The class index for which to retrieve SHAP values. If None (default), the function
+ will assume binary classification and use class 1 by default.
+
+ Returns:
+ --------
+ numpy.ndarray
+ The SHAP values for the specified class (classification) or for the regression output.
+
+ Raises:
+ -------
+ ValueError
+ If class_of_interest is specified and is out of bounds for the number of classes.
+ """
+ # Check if we are dealing with a multiclass classification
+ if isinstance(shap_values, list):
+ num_classes = len(shap_values)
+
+ # Default to class 1 for binary classification
+ if num_classes == 2 and class_of_interest is None:
+ logger.info(
+ "Binary classification detected: using SHAP values for class 1 (positive class)."
+ )
+ return shap_values[1]
+ else:
+ # Multiclass classification: use the specified class_of_interest
+ if class_of_interest is not None and 0 <= class_of_interest < num_classes:
+ logger.info(
+ f"Multiclass classification: using SHAP values for class {class_of_interest}."
+ )
+ return shap_values[class_of_interest]
+ else:
+ raise ValueError(
+ f"Invalid class_of_interest: {class_of_interest}. Must be between 0 and {num_classes - 1}."
+ )
+ else:
+ # For regression, return the SHAP values as they are
+ logger.info("Regression model detected: returning SHAP values as-is.")
+ return shap_values
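
A small usage sketch of the new helper, assuming it is importable from the module shown above and that the explainer produced one SHAP array per class:

    import numpy as np
    from validmind.tests.model_validation.sklearn.SHAPGlobalImportance import (
        _select_shap_values,
    )

    # Toy output for a 3-class classifier: one (n_samples, n_features) array per class.
    shap_values = [np.zeros((10, 4)), np.ones((10, 4)), np.full((10, 4), 2.0)]

    # class_of_interest=None keeps the old behaviour for binary models (class 1);
    # for multiclass the class index is passed explicitly via the new parameter.
    selected = _select_shap_values(shap_values, class_of_interest=2)
    assert selected.shape == (10, 4)
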

validmind/tests/model_validation/sklearn/SilhouettePlot.py
@@ -20,36 +20,44 @@ from validmind.vm_models import (
  @dataclass
  class SilhouettePlot(Metric):
  """
- Calculates and visualizes Silhouette Score, assessing degree of data point suitability to its cluster in ML models.
-
- **Purpose:** This test calculates the Silhouette Score, which is a model performance metric used in clustering
- applications. Primarily, the Silhouette Score evaluates how similar an object (data point) is to its own cluster
- compared to other clusters. The metric ranges between -1 and 1, where a high value indicates that the object is
- well matched to its own cluster and poorly matched to neighboring clusters. Thus, the goal is to achieve a high
- Silhouette Score, implying well-separated clusters.
-
- **Test Mechanism:** The test first extracts the true and predicted labels from the model's training data. The test
- runs the Silhouette Score function, which takes as input the training dataset features and the predicted labels,
- subsequently calculating the average score. This average Silhouette Score is printed for reference. The script then
- calculates the silhouette coefficients for each data point, helping to form the Silhouette Plot. Each cluster is
- represented in this plot, with color distinguishing between different clusters. A red dashed line indicates the
- average Silhouette Score. The Silhouette Scores are also collected into a structured table, facilitating model
- performance analysis and comparison.
-
- **Signs of High Risk:**
+ Calculates and visualizes Silhouette Score, assessing the degree of data point suitability to its cluster in ML
+ models.
+
+ ### Purpose
+
+ This test calculates the Silhouette Score, which is a model performance metric used in clustering applications.
+ Primarily, the Silhouette Score evaluates how similar a data point is to its own cluster compared to other
+ clusters. The metric ranges between -1 and 1, where a high value indicates that the object is well matched to its
+ own cluster and poorly matched to neighboring clusters. Thus, the goal is to achieve a high Silhouette Score,
+ implying well-separated clusters.
+
+ ### Test Mechanism
+
+ The test first extracts the true and predicted labels from the model's training data. The test runs the Silhouette
+ Score function, which takes as input the training dataset features and the predicted labels, subsequently
+ calculating the average score. This average Silhouette Score is printed for reference. The script then calculates
+ the silhouette coefficients for each data point, helping to form the Silhouette Plot. Each cluster is represented
+ in this plot, with color distinguishing between different clusters. A red dashed line indicates the average
+ Silhouette Score. The Silhouette Scores are also collected into a structured table, facilitating model performance
+ analysis and comparison.
+
+ ### Signs of High Risk
+
  - A low Silhouette Score, potentially indicating that the clusters are not well separated and that data points may
  not be fitting well to their respective clusters.
  - A Silhouette Plot displaying overlapping clusters or the absence of clear distinctions between clusters visually
  also suggests poor clustering performance.

- **Strengths:**
+ ### Strengths
+
  - The Silhouette Score provides a clear and quantitative measure of how well data points have been grouped into
  clusters, offering insights into model performance.
  - The Silhouette Plot provides an intuitive, graphical representation of the clustering mechanism, aiding visual
  assessments of model performance.
  - It does not require ground truth labels, so it's useful when true cluster assignments are not known.

- **Limitations:**
+ ### Limitations
+
  - The Silhouette Score may be susceptible to the influence of outliers, which could impact its accuracy and
  reliability.
  - It assumes the clusters are convex and isotropic, which might not be the case with complex datasets.
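
The quantities behind the plot, the per-point silhouette coefficients and the average score drawn as the red dashed line, come straight from scikit-learn; a minimal sketch with toy clusters:

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from sklearn.metrics import silhouette_samples, silhouette_score

    X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
    labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

    avg_score = silhouette_score(X, labels)      # the red dashed line in the plot
    per_point = silhouette_samples(X, labels)    # one coefficient per data point
    for k in np.unique(labels):
        print(f"cluster {k}: mean silhouette = {per_point[labels == k].mean():.3f}")
    print(f"average silhouette score = {avg_score:.3f}")
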

validmind/tests/model_validation/sklearn/TrainingTestDegradation.py
@@ -32,33 +32,40 @@ class TrainingTestDegradation(ThresholdTest):
  """
  Tests if model performance degradation between training and test datasets exceeds a predefined threshold.

- **Purpose**: The 'TrainingTestDegradation' class serves as a test to verify that the degradation in performance
- between the training and test datasets does not exceed a predefined threshold. This test serves as a measure to
- check the model's ability to generalize from its training data to unseen test data. It assesses key classification
- metric scores such as accuracy, precision, recall and f1 score, to verify the model's robustness and reliability.
-
- **Test Mechanism**: The code applies several predefined metrics including accuracy, precision, recall and f1 scores
- to the model's predictions for both the training and test datasets. It calculates the degradation as the difference
- between the training score and test score divided by the training score. The test is considered successful if the
- degradation for each metric is less than the preset maximum threshold of 10%. The results are summarized in a table
- showing each metric's train score, test score, degradation percentage, and pass/fail status.
-
- **Signs of High Risk**:
+ ### Purpose
+
+ The `TrainingTestDegradation` class serves as a test to verify that the degradation in performance between the
+ training and test datasets does not exceed a predefined threshold. This test measures the model's ability to
+ generalize from its training data to unseen test data, assessing key classification metrics such as accuracy,
+ precision, recall, and f1 score to verify the model's robustness and reliability.
+
+ ### Test Mechanism
+
+ The code applies several predefined metrics, including accuracy, precision, recall, and f1 scores, to the model's
+ predictions for both the training and test datasets. It calculates the degradation as the difference between the
+ training score and test score divided by the training score. The test is considered successful if the degradation
+ for each metric is less than the preset maximum threshold of 10%. The results are summarized in a table showing
+ each metric's train score, test score, degradation percentage, and pass/fail status.
+
+ ### Signs of High Risk
+
  - A degradation percentage that exceeds the maximum allowed threshold of 10% for any of the evaluated metrics.
  - A high difference or gap between the metric scores on the training and the test datasets.
  - The 'Pass/Fail' column displaying 'Fail' for any of the evaluated metrics.

- **Strengths**:
- - This test provides a quantitative measure of the model's ability to generalize to unseen data, which is key for
- predicting its practical real-world performance.
+ ### Strengths
+
+ - Provides a quantitative measure of the model's ability to generalize to unseen data, which is key for predicting
+ its practical real-world performance.
  - By evaluating multiple metrics, it takes into account different facets of model performance and enables a more
  holistic evaluation.
  - The use of a variable predefined threshold allows the flexibility to adjust the acceptability criteria for
  different scenarios.

- **Limitations**:
- - The test compares raw performance on training and test data, but does not factor in the nature of the data. Areas
- with less representation in the training set, for instance, might still perform poorly on unseen data.
+ ### Limitations
+
+ - The test compares raw performance on training and test data but does not factor in the nature of the data. Areas
+ with less representation in the training set might still perform poorly on unseen data.
  - It requires good coverage and balance in the test and training datasets to produce reliable results, which may
  not always be available.
  - The test is currently only designed for classification tasks.
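
The degradation rule described above, (train score - test score) / train score compared against the 10% default threshold, in a small standalone sketch with illustrative labels:

    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    # Placeholder predictions; in the test these come from the train and test datasets.
    y_train, y_train_pred = [1, 0, 1, 1, 0, 1], [1, 0, 1, 1, 0, 0]
    y_test, y_test_pred = [1, 0, 1, 0, 1, 0], [1, 0, 0, 0, 0, 0]

    max_threshold = 0.10  # default maximum allowed degradation
    for name, fn in [("accuracy", accuracy_score), ("precision", precision_score),
                     ("recall", recall_score), ("f1", f1_score)]:
        train_score, test_score = fn(y_train, y_train_pred), fn(y_test, y_test_pred)
        degradation = (train_score - test_score) / train_score
        verdict = "Pass" if degradation < max_threshold else "Fail"
        print(f"{name}: train={train_score:.2f} test={test_score:.2f} "
              f"degradation={degradation:.0%} {verdict}")
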

validmind/tests/model_validation/sklearn/VMeasure.py
@@ -14,42 +14,43 @@ class VMeasure(ClusterPerformance):
  """
  Evaluates homogeneity and completeness of a clustering model using the V Measure Score.

- **1. Purpose:**
+ ### Purpose
+
  The purpose of this metric, V Measure Score (V Score), is to evaluate the performance of a clustering model. It
  measures the homogeneity and completeness of a set of cluster labels, where homogeneity refers to each cluster
  containing only members of a single class and completeness meaning all members of a given class are assigned to the
  same cluster.

- **2. Test Mechanism:**
- ClusterVMeasure is a class that inherits from another class, ClusterPerformance. It uses the v_measure_score
+ ### Test Mechanism
+
+ ClusterVMeasure is a class that inherits from another class, ClusterPerformance. It uses the `v_measure_score`
  function from the sklearn module's metrics package. The required inputs to perform this metric are the model, train
  dataset, and test dataset. The test is appropriate for models tasked with clustering.

- **3. Signs of High Risk:**
+ ### Signs of High Risk

  - Low V Measure Score: A low V Measure Score indicates that the clustering model has poor homogeneity or
  completeness, or both. This might signal that the model is failing to correctly cluster the data.

- **4. Strengths:**
+ ### Strengths

  - The V Measure Score is a harmonic mean between homogeneity and completeness. This ensures that both attributes
  are taken into account when evaluating the model, providing an overall measure of its cluster validity.
-
  - The metric does not require knowledge of the ground truth classes when measuring homogeneity and completeness,
  making it applicable in instances where such information is unavailable.

- **5. Limitations:**
-
- - The V Score can be influenced by the number of clusters, which means that it might not always reflect the quality
- of the clustering. Partitioning the data into many small clusters could lead to high homogeneity but low
- completeness, leading to a low V Score even if the clustering might be useful.
+ ### Limitations

+ - The V Measure Score can be influenced by the number of clusters, which means that it might not always reflect the
+ quality of the clustering. Partitioning the data into many small clusters could lead to high homogeneity but low
+ completeness, leading to a low V Measure Score even if the clustering might be useful.
  - It assumes equal importance of homogeneity and completeness. In some applications, one may be more important than
- the other. The V Score does not provide flexibility in assigning different weights to homogeneity and completeness.
+ the other. The V Measure Score does not provide flexibility in assigning different weights to homogeneity and
+ completeness.
  """

  name = "v_measure_score"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",