validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
@@ -15,29 +15,36 @@ class HomogeneityScore(ClusterPerformance):
  Assesses clustering homogeneity by comparing true and predicted labels, scoring from 0 (heterogeneous) to 1
  (homogeneous).

- **Purpose**: The Homogeneity Score encapsulated in this performance test is used to measure the homogeneity of the
- clusters formed by a machine learning model. In simple terms, a clustering result satisfies homogeneity if all of
- its clusters contain only points which are members of a single class.
+ ### Purpose

- **Test Mechanism**: This test uses the `homogeneity_score` function from the `sklearn.metrics` library to compare
- the ground truth class labels of the training and testing sets with the labels predicted by the given model. The
- returned score is a metric of the clustering accuracy, and ranges from 0.0 to 1.0, with 1.0 denoting the highest
- possible degree of homogeneity.
+ The Homogeneity Score encapsulated in this performance test is used to measure the homogeneity of the clusters
+ formed by a machine learning model. In simple terms, a clustering result satisfies homogeneity if all of its
+ clusters contain only points which are members of a single class.
+
+ ### Test Mechanism
+
+ This test uses the `homogeneity_score` function from the `sklearn.metrics` library to compare the ground truth
+ class labels of the training and testing sets with the labels predicted by the given model. The returned score is a
+ metric of the clustering accuracy, and ranges from 0.0 to 1.0, with 1.0 denoting the highest possible degree of
+ homogeneity.
+
+ ### Signs of High Risk

- **Signs of High Risk**:
  - A score close to 0: This denotes that clusters are highly heterogenous and points within the same cluster might
  not belong to the same class.
  - A significantly lower score for testing data compared to the score for training data: This can indicate
  overfitting, where the model has learned to perfectly match the training data but fails to perform well on unseen
  data.

- **Strengths**:
+ ### Strengths
+
  - It provides a simple quantitative measure of the degree to which clusters contain points from only one class.
- - Useful for validating clustering solutions where the ground truth - class membership of points - is known.
+ - Useful for validating clustering solutions where the ground truth class membership of points is known.
  - It's agnostic to the absolute labels, and cares only that the points within the same cluster have the same class
  label.

- **Limitations**:
+ ### Limitations
+
  - The Homogeneity Score is not useful for clustering solutions where the ground truth labels are not known.
  - It doesn’t work well with differently sized clusters since it gives predominance to larger clusters.
  - The score does not address the actual number of clusters formed, or the evenness of cluster sizes. It only checks
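As a point of reference for the docstring hunk above (and not part of the released package), a minimal sketch of the mechanism it describes — comparing true class labels against predicted cluster labels with `sklearn.metrics.homogeneity_score` — might look like this; the synthetic data and KMeans setup are purely illustrative assumptions:

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import homogeneity_score

# Synthetic data with three well-separated classes (hypothetical example).
X, y_true = make_blobs(n_samples=300, centers=3, random_state=0)

# Fit a clustering model and score homogeneity: 1.0 means every cluster
# contains only members of a single class, 0.0 means clusters are fully mixed.
clusters = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
print(f"Homogeneity: {homogeneity_score(y_true, clusters):.3f}")
```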
@@ -45,7 +52,7 @@ class HomogeneityScore(ClusterPerformance):
  """

  name = "homogeneity_score"
- required_inputs = ["model", "datasets"]
+ required_inputs = ["model", "dataset"]
  tasks = ["clustering"]
  tags = [
  "sklearn",
@@ -16,37 +16,42 @@ class HyperParametersTuning(Metric):
  """
  Exerts exhaustive grid search to identify optimal hyperparameters for the model, improving performance.

- **Purpose:** The "HyperParametersTuning" metric being used here is intended to find the optimal set of
- hyperparameters for a given model. The test essentially aims to enhance the performance of the model under scrutiny
- by determining the best configuration of hyperparameters. The parameters that are being optimized are defined by
- the parameter grid that is passed to the metric.
-
- **Test Mechanism:** The HyperParametersTuning test employs a grid search mechanism using the function GridSearchCV
- from the scikit-learn library. The grid search algorithm is exhaustive: it systematically works through multiple
- combinations of the parameter tunes, cross-validated to determine which tune gives the best model performance. The
- chosen model and the parameters grid that are to be passed for tuning are the required inputs. Once the grid search
- is complete, the test caches and returns the details of the best model and its associated parameters.
-
- **Signs of High Risk:**
- - The test raises a SkipTestError if the param_grid is not supplied. This suggests that there are no specific
- parameters to optimize, which is a risk in certain model types that rely heavily on parameter tuning.
- - Poorly chosen scoring metrics that don't align well with the specific model or problem at hand might also reflect
- as a potential risk or failure in achieving the best performance.
-
- **Strengths:**
- - The test is a comprehensive exploratory mechanism that figures out the best set of hyperparameters for the
- supplied model, thereby helping improve its performance.
- - The implementation of GridSearchCV simplifies and automates the time-consuming task of hyperparameter tuning.
-
- **Limitations:**
- - The grid search algorithm can be computationally expensive, particularly with a large dataset or complex models.
- This grid search approach can be time-consuming as it tries out all possible combinations within the specified
- parameter grid.
- - The suitability of the tuning heavily relies on the quality of the data and it only accepts datasets with
+ ### Purpose:
+
+ The "HyperParametersTuning" metric aims to find the optimal set of hyperparameters for a given model. The test is
+ designed to enhance the performance of the model by determining the best configuration of hyperparameters. The
+ parameters that are being optimized are defined by the parameter grid provided to the metric.
+
+ ### Test Mechanism:
+
+ The HyperParametersTuning test employs a grid search mechanism using the GridSearchCV function from the
+ scikit-learn library. The grid search algorithm systematically works through multiple combinations of parameter
+ values, cross-validating to determine which combination gives the best model performance. The chosen model and the
+ parameter grid passed for tuning are necessary inputs. Once the grid search is complete, the test caches and
+ returns details of the best model and its associated parameters.
+
+ ### Signs of High Risk:
+
+ - The test raises a SkipTestError if the param_grid is not supplied, indicating a lack of specific parameters to
+ optimize, which can be risky for certain model types reliant on parameter tuning.
+ - Poorly chosen scoring metrics that do not align well with the specific model or problem at hand could reflect
+ potential risks or failures in achieving optimal performance.
+
+ ### Strengths:
+
+ - Provides a comprehensive exploration mechanism to identify the best set of hyperparameters for the supplied
+ model, thereby enhancing its performance.
+ - Implements GridSearchCV, simplifying and automating the time-consuming task of hyperparameter tuning.
+
+ ### Limitations:
+
+ - The grid search algorithm can be computationally expensive, especially with large datasets or complex models, and
+ can be time-consuming as it tests all possible combinations within the specified parameter grid.
+ - The effectiveness of the tuning is heavily dependent on the quality of data and only accepts datasets with
  numerical or ordered categories.
- - The functionality assumes that the same set of hyperparameters is optimal for all problem sets, which may not
- hold true in every scenario.
- - There is a potential risk of overfitting the model if the training set is not representative of the data the
+ - Assumes that the same set of hyperparameters is optimal for all problem sets, which may not be true in every
+ scenario.
+ - There's a potential risk of overfitting the model if the training set is not representative of the data that the
  model will be applied to.
  """
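For context on the GridSearchCV mechanism the rewritten docstring describes, here is a minimal, hypothetical sketch using scikit-learn directly; the dataset, estimator, and parameter grid are illustrative assumptions rather than ValidMind defaults:

```python
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = load_breast_cancer(return_X_y=True)

# Every combination in param_grid is tried with cross-validation.
param_grid = {"n_estimators": [50, 100], "max_depth": [3, 5, None]}
search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, cv=3)
search.fit(X, y)

print("Best parameters:", search.best_params_)
print("Best CV score:", round(search.best_score_, 3))
```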
 
@@ -19,13 +19,15 @@ class KMeansClustersOptimization(Metric):
  """
  Optimizes the number of clusters in K-means models using Elbow and Silhouette methods.

- **Purpose:**
+ ### Purpose
+
  This metric is used to optimize the number of clusters used in K-means clustering models. It intends to measure and
  evaluate the optimal number of clusters by leveraging two methodologies, namely the Elbow method and the Silhouette
  method. This is crucial as an inappropriate number of clusters can either overly simplify or overcomplicate the
  structure of the data, thereby undermining the effectiveness of the model.

- **Test Mechanism:**
+ ### Test Mechanism
+
  The test mechanism involves iterating over a predefined range of cluster numbers and applying both the Elbow method
  and the Silhouette method. The Elbow method computes the sum of the minimum euclidean distances between data points
  and their respective cluster centers (distortion). This value decreases as the number of clusters increases; the
@@ -35,20 +37,23 @@ class KMeansClustersOptimization(Metric):
  of clusters under this method is the one that maximizes the average silhouette score. The results of both methods
  are plotted for visual inspection.

- **Signs of High Risk:**
+ ### Signs of High Risk
+
  - A high distortion value or a low silhouette average score for the optimal number of clusters.
  - No clear 'elbow' point or plateau observed in the distortion plot, or a uniformly low silhouette average score
  across different numbers of clusters, suggesting the data is not amenable to clustering.
  - An optimal cluster number that is unreasonably high or low, suggestive of overfitting or underfitting,
  respectively.

- **Strengths:**
+ ### Strengths
+
  - Provides both a visual and quantitative method to determine the optimal number of clusters.
  - Leverages two different methods (Elbow and Silhouette), thereby affording robustness and versatility in assessing
  the data's clusterability.
  - Facilitates improved model performance by allowing for an informed selection of the number of clusters.

- **Limitations:**
+ ### Limitations
+
  - Assumes that a suitable number of clusters exists in the data, which may not always be true, especially for
  complex or noisy data.
  - Both methods may fail to provide definitive answers when the data lacks clear cluster structures.
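A rough sketch of the Elbow/Silhouette sweep described above, shown only for orientation; the synthetic blobs and the range of k values are arbitrary choices for illustration:

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=500, centers=4, random_state=1)

for k in range(2, 8):
    km = KMeans(n_clusters=k, n_init=10, random_state=1).fit(X)
    # Elbow method: look for the k where inertia (distortion) stops dropping sharply.
    # Silhouette method: pick the k that maximizes the average silhouette score.
    print(k, round(km.inertia_, 1), round(silhouette_score(X, km.labels_), 3))
```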
@@ -22,38 +22,38 @@ class MinimumAccuracy(ThresholdTest):
  """
  Checks if the model's prediction accuracy meets or surpasses a specified threshold.

- **Purpose**: The Minimum Accuracy test’s objective is to verify whether the model's prediction accuracy on a
- specific dataset meets or surpasses a predetermined minimum threshold. Accuracy, which is simply the ratio of right
- predictions to total predictions, is a key metric for evaluating the model's performance. Considering binary as
- well as multiclass classifications, accurate labeling becomes indispensable.
-
- **Test Mechanism**: The test mechanism involves contrasting the model's accuracy score with a pre-set minimum
- threshold value, default value being 0.7. The accuracy score is computed utilizing sklearn’s `accuracy_score`
- method, where the true label `y_true` and predicted label `class_pred` are compared. If the accuracy score is above
- the threshold, the test gets a passing mark. The test returns the result along with the accuracy score and
- threshold used for the test.
-
- **Signs of High Risk**:
- - The risk level for this test surges considerably when the model is unable to achieve or surpass the predefined
- score threshold.
- - When the model persistently scores below the threshold, it suggests a high risk of inaccurate predictions, which
- in turn affects the model’s efficiency and reliability.
-
- **Strengths**:
- - One of the key strengths of this test is its simplicity, presenting a straightforward measure of the holistic
- model performance across all classes.
- - This test is particularly advantageous when classes are balanced.
- - Another advantage of this test is its versatility as it can be implemented on both binary and multiclass
- classification tasks.
-
- **Limitations**:
- - When analyzing imbalanced datasets, certain limitations of this test emerge. The accuracy score can be misleading
- when classes in the dataset are skewed considerably.
- - This can result in favoritism towards the majority class, consequently giving an inaccurate perception of the
- model performance.
- - Another limitation is its inability to measure the model's precision, recall, or capacity to manage false
- positives or false negatives.
- - The test majorly focuses on overall correctness and may not be sufficient for all types of model analytics.
+ ### Purpose
+
+ The Minimum Accuracy test’s objective is to verify whether the model's prediction accuracy on a specific dataset
+ meets or surpasses a predetermined minimum threshold. Accuracy, which is simply the ratio of correct predictions to
+ total predictions, is a key metric for evaluating the model's performance. Considering binary as well as multiclass
+ classifications, accurate labeling becomes indispensable.
+
+ ### Test Mechanism
+
+ The test mechanism involves contrasting the model's accuracy score with a preset minimum threshold value, with the
+ default being 0.7. The accuracy score is computed utilizing sklearn’s `accuracy_score` method, where the true
+ labels `y_true` and predicted labels `class_pred` are compared. If the accuracy score is above the threshold, the
+ test receives a passing mark. The test returns the result along with the accuracy score and threshold used for the
+ test.
+
+ ### Signs of High Risk
+
+ - Model fails to achieve or surpass the predefined score threshold.
+ - Persistent scores below the threshold, indicating a high risk of inaccurate predictions.
+
+ ### Strengths
+
+ - Simplicity, presenting a straightforward measure of holistic model performance across all classes.
+ - Particularly advantageous when classes are balanced.
+ - Versatile, as it can be implemented on both binary and multiclass classification tasks.
+
+ ### Limitations
+
+ - Misleading accuracy scores when classes in the dataset are highly imbalanced.
+ - Favoritism towards the majority class, giving an inaccurate perception of model performance.
+ - Inability to measure the model's precision, recall, or capacity to manage false positives or false negatives.
+ - Focused on overall correctness and may not be sufficient for all types of model analytics.
  """

  name = "accuracy_score"
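The pass/fail rule described in the rewritten docstring reduces to a single `accuracy_score` comparison against a minimum threshold (0.7 is the documented default); the labels below are invented for illustration:

```python
from sklearn.metrics import accuracy_score

y_true = [0, 1, 1, 0, 1, 0, 1, 1]      # ground-truth labels (made up)
class_pred = [0, 1, 1, 0, 0, 0, 1, 1]  # model predictions (made up)

min_threshold = 0.7  # documented default
score = accuracy_score(y_true, class_pred)
print(f"accuracy={score:.2f}, threshold={min_threshold}, passed={score > min_threshold}")
```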
@@ -21,42 +21,42 @@ from validmind.vm_models import (
  @dataclass
  class MinimumF1Score(ThresholdTest):
  """
- Evaluates if the model's F1 score on the validation set meets a predefined minimum threshold.
+ Assesses if the model's F1 score on the validation set meets a predefined minimum threshold, ensuring balanced
+ performance between precision and recall.
+
+ ### Purpose

- **Purpose:**
  The main objective of this test is to ensure that the F1 score, a balanced measure of precision and recall, of the
  model meets or surpasses a predefined threshold on the validation dataset. The F1 score is highly useful for
  gauging model performance in classification tasks, especially in cases where the distribution of positive and
  negative classes is skewed.

- **Test Mechanism:**
- The F1 score for the validation dataset is computed through the scikit-learn's metrics in Python. The scoring
- mechanism differs based on the classification problem: for multi-class problems, macro averaging is used (metrics
- are calculated separately and their unweighted mean is found), and for binary classification, the built-in f1_score
- calculation is used. The obtained F1 score is then assessed against the predefined minimum F1 score that is
- expected from the model.
+ ### Test Mechanism
+
+ The F1 score for the validation dataset is computed through scikit-learn's metrics in Python. The scoring mechanism
+ differs based on the classification problem: for multi-class problems, macro averaging is used, and for binary
+ classification, the built-in `f1_score` calculation is used. The obtained F1 score is then assessed against the
+ predefined minimum F1 score that is expected from the model.

- **Signs of High Risk:**
+ ### Signs of High Risk

  - If a model returns an F1 score that is less than the established threshold, it is regarded as high risk.
- - A low F1 score might suggest that the model is not finding an optimal balance between precision and recall, see:
- it isn't successfully identifying positive classes while minimizing false positives.
+ - A low F1 score might suggest that the model is not finding an optimal balance between precision and recall,
+ failing to effectively identify positive classes while minimizing false positives.

- **Strengths:**
+ ### Strengths

- - This metric gives a balanced measure of a model's performance by accounting for both false positives and false
- negatives.
- - It has a particular advantage in scenarios with imbalanced class distribution, where an accuracy measure can be
- misleading.
- - The flexibility of setting the threshold value allows for tailoring the minimum acceptable performance.
+ - Provides a balanced measure of a model's performance by accounting for both false positives and false negatives.
+ - Particularly advantageous in scenarios with imbalanced class distribution, where accuracy can be misleading.
+ - Flexibility in setting the threshold value allows tailored minimum acceptable performance standards.

- **Limitations:**
+ ### Limitations

- - The testing method may not be suitable for all types of models and machine learning tasks.
- - Although the F1 score gives a balanced view of a model's performance, it presupposes an equal cost for false
- positives and false negatives, which may not always be true in certain real-world scenarios. As a consequence,
- practitioners might have to rely on other metrics such as precision, recall, or the ROC-AUC score that align more
- closely with their specific requirements.
+ - May not be suitable for all types of models and machine learning tasks.
+ - The F1 score assumes an equal cost for false positives and false negatives, which may not be true in some
+ real-world scenarios.
+ - Practitioners might need to rely on other metrics such as precision, recall, or the ROC-AUC score that align more
+ closely with specific requirements.
  """

  name = "f1_score"
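A hedged sketch of the scoring rule described above — macro-averaged F1 for multi-class targets, plain binary F1 otherwise. The helper and its threshold value are hypothetical, not ValidMind code:

```python
from sklearn.metrics import f1_score

def f1_meets_threshold(y_true, y_pred, min_threshold=0.5):
    # Macro averaging for multi-class targets, plain binary F1 otherwise.
    average = "macro" if len(set(y_true)) > 2 else "binary"
    score = f1_score(y_true, y_pred, average=average)
    return score, score >= min_threshold

print(f1_meets_threshold([0, 1, 2, 2, 1, 0], [0, 2, 2, 2, 1, 0]))  # multi-class
print(f1_meets_threshold([0, 1, 1, 0, 1], [0, 1, 0, 0, 1]))        # binary
```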
@@ -23,32 +23,37 @@ class MinimumROCAUCScore(ThresholdTest):
  """
  Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.

- **Purpose**:
- This test metric, Minimum ROC AUC Score, is used to determine the model's performance by ensuring that the Receiver
- Operating Characteristic Area Under the Curve (ROC AUC) score on the validation dataset meets or exceeds a
- predefined threshold. The ROC AUC score is an indicator of how well the model is capable of distinguishing between
- different classes, making it a crucial measure in binary and multiclass classification tasks.
+ ### Purpose
+
+ The Minimum ROC AUC Score test is used to determine the model's performance by ensuring that the Receiver Operating
+ Characteristic Area Under the Curve (ROC AUC) score on the validation dataset meets or exceeds a predefined
+ threshold. The ROC AUC score indicates how well the model can distinguish between different classes, making it a
+ crucial measure in binary and multiclass classification tasks.
+
+ ### Test Mechanism

- **Test Mechanism**:
  This test implementation calculates the multiclass ROC AUC score on the true target values and the model's
- prediction. The test converts the multi-class target variables into binary format using `LabelBinarizer` before
+ predictions. The test converts the multi-class target variables into binary format using `LabelBinarizer` before
  computing the score. If this ROC AUC score is higher than the predefined threshold (defaulted to 0.5), the test
  passes; otherwise, it fails. The results, including the ROC AUC score, the threshold, and whether the test passed
  or failed, are then stored in a `ThresholdTestResult` object.

- **Signs of High Risk**:
+ ### Signs of High Risk
+
  - A high risk or failure in the model's performance as related to this metric would be represented by a low ROC AUC
  score, specifically any score lower than the predefined minimum threshold. This suggests that the model is
  struggling to distinguish between different classes effectively.

- **Strengths**:
+ ### Strengths
+
  - The test considers both the true positive rate and false positive rate, providing a comprehensive performance
  measure.
  - ROC AUC score is threshold-independent meaning it measures the model's quality across various classification
  thresholds.
  - Works robustly with binary as well as multi-class classification problems.

- **Limitations**:
+ ### Limitations
+
  - ROC AUC may not be useful if the class distribution is highly imbalanced; it could perform well in terms of AUC
  but still fail to predict the minority class.
  - The test does not provide insight into what specific aspects of the model are causing poor performance if the ROC
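For reference, the multiclass ROC AUC computation the docstring names (`LabelBinarizer` followed by `roc_auc_score`) can be sketched as follows; the class probabilities below are fabricated for illustration:

```python
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

y_true = np.array([0, 1, 2, 2, 1, 0])  # true classes (made up)
y_prob = np.array([                    # predicted class probabilities (made up)
    [0.8, 0.1, 0.1],
    [0.2, 0.6, 0.2],
    [0.1, 0.2, 0.7],
    [0.2, 0.2, 0.6],
    [0.3, 0.5, 0.2],
    [0.7, 0.2, 0.1],
])

# Binarize the multi-class targets, then macro-average one AUC per class column.
y_true_bin = LabelBinarizer().fit_transform(y_true)
score = roc_auc_score(y_true_bin, y_prob, average="macro")
print(f"ROC AUC={score:.3f}, passed={score > 0.5}")
```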
@@ -19,33 +19,40 @@ class ModelsPerformanceComparison(ClassifierPerformance):
  Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,
  precision, recall, and F1 score.

- **Purpose**: This metric test aims to evaluate and compare the performance of various Machine Learning models using
- test data. It employs multiple metrics such as accuracy, precision, recall, and the F1 score, among others, to
- assess model performance and assist in selecting the most effective model for the designated task.
-
- **Test Mechanism**: The test employs Scikit-learn’s performance metrics to evaluate each model's performance for
- both binary and multiclass classification tasks. To compare performances, the test runs each model against the test
- dataset, then produces a comprehensive classification report. This report includes metrics such as accuracy,
- precision, recall, and the F1 score. Based on whether the task at hand is binary or multiclass classification, it
- calculates metrics all the classes and their weighted averages, macro averages,
- and per class metrics. The test will be skipped if no models are supplied.
-
- **Signs of High Risk**:
+ ### Purpose
+
+ The Models Performance Comparison test aims to evaluate and compare the performance of various Machine Learning
+ models using test data. It employs multiple metrics such as accuracy, precision, recall, and the F1 score, among
+ others, to assess model performance and assist in selecting the most effective model for the designated task.
+
+ ### Test Mechanism
+
+ The test employs Scikit-learn’s performance metrics to evaluate each model's performance for both binary and
+ multiclass classification tasks. To compare performances, the test runs each model against the test dataset, then
+ produces a comprehensive classification report. This report includes metrics such as accuracy, precision, recall,
+ and the F1 score. Based on whether the task at hand is binary or multiclass classification, it calculates metrics
+ for all the classes and their weighted averages, macro averages, and per-class metrics. The test will be skipped if
+ no models are supplied.
+
+ ### Signs of High Risk
+
  - Low scores in accuracy, precision, recall, and F1 metrics indicate a potentially high risk.
  - A low area under the Receiver Operating Characteristic (ROC) curve (roc_auc score) is another possible indicator
  of high risk.
  - If the metrics scores are significantly lower than alternative models, this might suggest a high risk of failure.

- **Strengths**:
- - The test provides a simple way to compare the performance of multiple models, accommodating both binary and
- multiclass classification tasks.
- - It provides a holistic view of model performance through a comprehensive report of key performance metrics.
+ ### Strengths
+
+ - Provides a simple way to compare the performance of multiple models, accommodating both binary and multiclass
+ classification tasks.
+ - Offers a holistic view of model performance through a comprehensive report of key performance metrics.
  - The inclusion of the ROC AUC score is advantageous, as this robust performance metric can effectively handle
  class imbalance issues.

- **Limitations**:
- - This test may not be suitable for more complex performance evaluations that consider factors such as prediction
- speed, computational cost, or business-specific constraints.
+ ### Limitations
+
+ - May not be suitable for more complex performance evaluations that consider factors such as prediction speed,
+ computational cost, or business-specific constraints.
  - The test's reliability depends on the provided test dataset; hence, the selected models' performance could vary
  with unseen data or changes in the data distribution.
  - The ROC AUC score might not be as meaningful or easily interpretable for multilabel/multiclass tasks.
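A minimal sketch, in the spirit of the docstring above, of comparing two classifiers on one test set with scikit-learn's `classification_report`; the models and the train/test split are illustrative assumptions:

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(
    *load_iris(return_X_y=True), test_size=0.3, random_state=0
)

for name, model in [
    ("logistic_regression", LogisticRegression(max_iter=1000)),
    ("decision_tree", DecisionTreeClassifier(random_state=0)),
]:
    model.fit(X_train, y_train)
    # Per-class precision/recall/F1 plus macro and weighted averages.
    print(name)
    print(classification_report(y_test, model.predict(X_test)))
```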
@@ -187,6 +187,8 @@ def overfit_diagnosis( # noqa: C901
  feature columns. It calculates the difference between the training and test performance
  for each group and identifies regions where the difference exceeds a specified threshold.

+ ## Test Methodology
+
  This test works for both classification and regression models and with a variety of
  performance metrics. By default, it uses the AUC metric for classification models and
  the MSE metric for regression models. The threshold for identifying overfit regions
@@ -308,28 +310,46 @@ def overfit_diagnosis( # noqa: C901

  @dataclass
  class OverfitDiagnosis(ThresholdTest):
- """Identify overfit regions in a model's predictions.
+ """
+ Assesses potential overfitting in a model's predictions, identifying regions where performance between training and
+ testing sets deviates significantly.

- This test compares the model's performance on training versus test data, grouped by
- feature columns. It calculates the difference between the training and test performance
- for each group and identifies regions where the difference exceeds a specified threshold.
+ ### Purpose

- This test works for both classification and regression models and with a variety of
- performance metrics. By default, it uses the AUC metric for classification models and
- the MSE metric for regression models. The threshold for identifying overfit regions
- defaults to 0.04 but should be adjusted based on the specific use case.
+ The Overfit Diagnosis test aims to identify areas in a model's predictions where there is a significant difference
+ in performance between the training and testing sets. This test helps to pinpoint specific regions or feature
+ segments where the model may be overfitting.

- ## Inputs
- - `model` (VMModel): The ValidMind model object to evaluate.
- - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
- is the training data and the second dataset is the test data.
+ ### Test Mechanism

- ## Parameters
- - `metric` (str, optional): The performance metric to use for evaluation. Choose from:
- 'accuracy', 'auc', 'f1', 'precision', 'recall', 'mse', 'mae', 'r2', 'mape'.
- Defaults to 'auc' for classification models and 'mse' for regression models.
- - `cut_off_threshold` (float, optional): The threshold for identifying overfit regions.
- Defaults to 0.04.
+ This test compares the model's performance on training versus test data, grouped by feature columns. It calculates
+ the difference between the training and test performance for each group and identifies regions where this
+ difference exceeds a specified threshold:
+
+ - The test works for both classification and regression models.
+ - It defaults to using the AUC metric for classification models and the MSE metric for regression models.
+ - The threshold for identifying overfitting regions is set to 0.04 by default.
+ - The test calculates the performance metrics for each feature segment and plots regions where the performance gap
+ exceeds the threshold.
+
+ ### Signs of High Risk
+
+ - Significant gaps between training and test performance metrics for specific feature segments.
+ - Multiple regions with performance gaps exceeding the defined threshold.
+ - Higher than expected differences in predicted versus actual values in the test set compared to the training set.
+
+ ### Strengths
+
+ - Identifies specific areas where overfitting occurs.
+ - Supports multiple performance metrics, providing flexibility.
+ - Applicable to both classification and regression models.
+ - Visualization of overfitting segments aids in better understanding and debugging.
+
+ ### Limitations
+
+ - The default threshold may not be suitable for all use cases and requires tuning.
+ - May not capture more subtle forms of overfitting that do not exceed the threshold.
+ - Assumes that the binning of features adequately represents the data segments.
  """

  required_inputs = ["model", "datasets"]
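A simplified, hypothetical version of the segment-wise gap check the rewritten docstring describes — bin one feature, compute a metric per bin on the training and test sets, and flag bins whose gap exceeds the 0.04 default — might look like this; the synthetic data, decision tree, and accuracy metric are illustrative stand-ins, not the ValidMind implementation:

```python
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Synthetic classification data; a fully grown tree tends to overfit it.
X, y = make_classification(n_samples=1000, n_features=4, random_state=0)
features = [f"x{i}" for i in range(4)]
train, test, y_train, y_test = train_test_split(
    pd.DataFrame(X, columns=features), y, random_state=0
)
model = DecisionTreeClassifier(random_state=0).fit(train, y_train)

feature, cut_off_threshold = "x0", 0.04
edges = np.histogram_bin_edges(train[feature], bins=5)
for lo, hi in zip(edges[:-1], edges[1:]):
    tr = ((train[feature] >= lo) & (train[feature] <= hi)).to_numpy()
    te = ((test[feature] >= lo) & (test[feature] <= hi)).to_numpy()
    if tr.sum() == 0 or te.sum() == 0:
        continue  # skip empty segments
    gap = accuracy_score(y_train[tr], model.predict(train[tr])) - accuracy_score(
        y_test[te], model.predict(test[te])
    )
    if gap > cut_off_threshold:
        print(f"possible overfit region {lo:.2f}..{hi:.2f}: gap={gap:+.3f}")
```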
@@ -20,34 +20,40 @@ class PermutationFeatureImportance(Metric):
  Assesses the significance of each feature in a model by evaluating the impact on model performance when feature
  values are randomly rearranged.

- **Purpose**: The purpose of the Permutation Feature Importance (PFI) metric is to assess the importance of each
- feature used by the Machine Learning model. The significance is measured by evaluating the decrease in the model's
- performance when the feature's values are randomly arranged.
+ ### Purpose

- **Test Mechanism**: PFI is calculated via the `permutation_importance` method from the `sklearn.inspection` module.
- This method shuffles the columns of the feature dataset and measures the impact on the model's performance. A
- significant decrease in performance after permutating a feature's values deems the feature as important. On the
- other hand, if performance remains the same, the feature is likely not important. The output of the PFI metric is a
- figure illustrating the importance of each feature.
+ The Permutation Feature Importance (PFI) metric aims to assess the importance of each feature used by the Machine
+ Learning model. The significance is measured by evaluating the decrease in the model's performance when the
+ feature's values are randomly arranged.
+
+ ### Test Mechanism
+
+ PFI is calculated via the `permutation_importance` method from the `sklearn.inspection` module. This method
+ shuffles the columns of the feature dataset and measures the impact on the model's performance. A significant
+ decrease in performance after permutating a feature's values deems the feature as important. On the other hand, if
+ performance remains the same, the feature is likely not important. The output of the PFI metric is a figure
+ illustrating the importance of each feature.
+
+ ### Signs of High Risk

- **Signs of High Risk**:
  - The model heavily relies on a feature with highly variable or easily permutable values, indicating instability.
- - A feature, deemed unimportant by the model but based on domain knowledge should have a significant effect on the
- outcome, is not influencing the model's predictions.
-
- **Strengths**:
- - PFI provides insights into the importance of different features and may reveal underlying data structure.
- - It can indicate overfitting if a particular feature or set of features overly impacts the model's predictions.
- - The metric is model-agnostic and can be used with any classifier that provides a measure of prediction accuracy
- before and after feature permutation.
-
- **Limitations**:
- - The feature importance calculated does not imply causality, it only presents the amount of information that a
- feature provides for the prediction task.
- - The metric does not account for interactions between features. If features are correlated, the permutation
- importance may allocate importance to one and not the other.
- - PFI cannot interact with certain libraries like statsmodels, pytorch, catboost, etc, thus limiting its
- applicability.
+ - A feature deemed unimportant by the model but expected to have a significant effect on the outcome based on
+ domain knowledge is not influencing the model's predictions.
+
+ ### Strengths
+
+ - Provides insights into the importance of different features and may reveal underlying data structure.
+ - Can indicate overfitting if a particular feature or set of features overly impacts the model's predictions.
+ - Model-agnostic and can be used with any classifier that provides a measure of prediction accuracy before and
+ after feature permutation.
+
+ ### Limitations
+
+ - Does not imply causality; it only presents the amount of information that a feature provides for the prediction
+ task.
+ - Does not account for interactions between features. If features are correlated, the permutation importance may
+ allocate importance to one and not the other.
+ - Cannot interact with certain libraries like statsmodels, pytorch, catboost, etc., thus limiting its applicability.
  """
  name = "pfi"

@@ -103,7 +109,7 @@ class PermutationFeatureImportance(Metric):
  )
  )
  fig.update_layout(
- title_text="Permutation Importances (train set)",
+ title_text="Permutation Importances",
  yaxis=dict(
  tickmode="linear", # set tick mode to linear
  dtick=1, # set interval between ticks
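Finally, the underlying call named in the PFI docstring hunk, `sklearn.inspection.permutation_importance`, can be sketched as follows; the dataset and estimator are chosen only for illustration and are not ValidMind defaults:

```python
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
model = RandomForestClassifier(random_state=0).fit(X, y)

# Shuffle each feature column n_repeats times and record the mean score drop.
result = permutation_importance(model, X, y, n_repeats=5, random_state=0)
ranked = sorted(zip(X.columns, result.importances_mean), key=lambda t: -t[1])
for name, mean_drop in ranked[:5]:
    print(f"{name}: {mean_drop:.4f}")
```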