validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +26 -7
  3. validmind/api_client.py +89 -43
  4. validmind/client.py +2 -2
  5. validmind/client_config.py +11 -14
  6. validmind/datasets/regression/fred_timeseries.py +67 -138
  7. validmind/template.py +1 -0
  8. validmind/test_suites/__init__.py +0 -2
  9. validmind/test_suites/statsmodels_timeseries.py +1 -1
  10. validmind/test_suites/summarization.py +0 -1
  11. validmind/test_suites/time_series.py +0 -43
  12. validmind/tests/__types__.py +3 -13
  13. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  14. validmind/tests/data_validation/ADF.py +31 -24
  15. validmind/tests/data_validation/AutoAR.py +9 -9
  16. validmind/tests/data_validation/AutoMA.py +23 -16
  17. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  18. validmind/tests/data_validation/AutoStationarity.py +21 -16
  19. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  20. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
  21. validmind/tests/data_validation/ClassImbalance.py +15 -12
  22. validmind/tests/data_validation/DFGLSArch.py +19 -13
  23. validmind/tests/data_validation/DatasetDescription.py +17 -11
  24. validmind/tests/data_validation/DatasetSplit.py +7 -5
  25. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  26. validmind/tests/data_validation/Duplicates.py +33 -25
  27. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  28. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  29. validmind/tests/data_validation/HighCardinality.py +19 -12
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  32. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  33. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  34. validmind/tests/data_validation/KPSS.py +34 -29
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  36. validmind/tests/data_validation/MissingValues.py +32 -27
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  38. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  39. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  40. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  41. validmind/tests/data_validation/ScatterPlot.py +63 -78
  42. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  43. validmind/tests/data_validation/Skewness.py +35 -37
  44. validmind/tests/data_validation/SpreadPlot.py +35 -35
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  47. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  48. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  49. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  50. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  51. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  52. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  53. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  54. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  55. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  57. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  58. validmind/tests/data_validation/UniqueRows.py +11 -6
  59. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  60. validmind/tests/data_validation/WOEBinTable.py +35 -30
  61. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  62. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  63. validmind/tests/data_validation/nlp/Hashtags.py +27 -20
  64. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  65. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  66. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  67. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  68. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  69. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  70. validmind/tests/data_validation/nlp/TextDescription.py +36 -35
  71. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  72. validmind/tests/decorator.py +81 -42
  73. validmind/tests/model_validation/BertScore.py +36 -27
  74. validmind/tests/model_validation/BleuScore.py +25 -19
  75. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  76. validmind/tests/model_validation/ContextualRecall.py +35 -13
  77. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  78. validmind/tests/model_validation/MeteorScore.py +46 -33
  79. validmind/tests/model_validation/ModelMetadata.py +32 -64
  80. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  81. validmind/tests/model_validation/RegardScore.py +30 -14
  82. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  83. validmind/tests/model_validation/RougeScore.py +36 -30
  84. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  85. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  86. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  87. validmind/tests/model_validation/TokenDisparity.py +31 -23
  88. validmind/tests/model_validation/ToxicityScore.py +26 -17
  89. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  90. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  91. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  92. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  93. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  94. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  95. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  96. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  97. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  98. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  99. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  100. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  101. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  102. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  103. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  104. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  105. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  106. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  107. validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
  108. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  109. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  110. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  111. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  112. validmind/tests/model_validation/ragas/utils.py +6 -0
  113. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  114. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  115. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  116. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  117. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  118. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  119. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  120. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  121. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  122. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  123. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  124. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  125. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  126. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  127. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  128. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  129. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  130. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  131. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
  132. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  133. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  134. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  135. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  136. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  137. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  138. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
  139. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  140. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
  141. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
  142. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  143. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  144. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  145. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  146. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  147. validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
  148. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  149. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
  150. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  151. validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
  152. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  153. validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
  154. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  155. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
  156. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  157. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  158. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  159. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  160. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  161. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  162. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  163. validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
  164. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  165. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
  166. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  167. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  168. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  169. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  170. validmind/tests/prompt_validation/Bias.py +14 -11
  171. validmind/tests/prompt_validation/Clarity.py +16 -14
  172. validmind/tests/prompt_validation/Conciseness.py +7 -5
  173. validmind/tests/prompt_validation/Delimitation.py +23 -22
  174. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  175. validmind/tests/prompt_validation/Robustness.py +12 -10
  176. validmind/tests/prompt_validation/Specificity.py +13 -11
  177. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  178. validmind/tests/run.py +68 -23
  179. validmind/unit_metrics/__init__.py +81 -144
  180. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  181. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  182. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  183. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  184. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  185. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  186. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  187. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  188. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  189. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  190. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  191. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  192. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  193. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  194. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  195. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  196. validmind/vm_models/dataset/dataset.py +2 -0
  197. validmind/vm_models/figure.py +5 -0
  198. validmind/vm_models/test/result_wrapper.py +93 -132
  199. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
  200. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
  201. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  202. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  203. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  204. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  205. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  206. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  207. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  208. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  209. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  210. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
  211. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
  212. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/statsmodels/GINITable.py

@@ -2,34 +2,37 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
 import pandas as pd
 from sklearn.metrics import roc_auc_score, roc_curve
 
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks
 
 
-@dataclass
-class GINITable(Metric):
+@tags("model_performance")
+@tasks("classification")
+def GINITable(dataset, model):
     """
     Evaluates classification model performance using AUC, GINI, and KS metrics for training and test datasets.
 
-    **Purpose**: The 'GINITable' metric is designed to evaluate the performance of a classification model by
-    emphasizing its discriminatory power. Specifically, it calculates and presents three important metrics - the Area
-    under the ROC Curve (AUC), the GINI coefficient, and the Kolmogov-Smirnov (KS) statistic - for both training and
-    test datasets.
+    ### Purpose
+
+    The 'GINITable' metric is designed to evaluate the performance of a classification model by emphasizing its
+    discriminatory power. Specifically, it calculates and presents three important metrics - the Area under the ROC
+    Curve (AUC), the GINI coefficient, and the Kolmogorov-Smirnov (KS) statistic - for both training and test datasets.
+
+    ### Test Mechanism
 
-    **Test Mechanism**: Using a dictionary for storing performance metrics for both the training and test datasets, the
-    'GINITable' metric calculates each of these metrics sequentially. The Area under the ROC Curve (AUC) is calculated
-    via the `roc_auc_score` function from the Scikit-Learn library. The GINI coefficient, a measure of statistical
-    dispersion, is then computed by doubling the AUC and subtracting 1. Finally, the Kolmogov-Smirnov (KS) statistic is
+    Using a dictionary for storing performance metrics for both the training and test datasets, the 'GINITable' metric
+    calculates each of these metrics sequentially. The Area under the ROC Curve (AUC) is calculated via the
+    `roc_auc_score` function from the Scikit-Learn library. The GINI coefficient, a measure of statistical dispersion,
+    is then computed by doubling the AUC and subtracting 1. Finally, the Kolmogorov-Smirnov (KS) statistic is
     calculated via the `roc_curve` function from Scikit-Learn, with the False Positive Rate (FPR) subtracted from the
     True Positive Rate (TPR) and the maximum value taken from the resulting data. These metrics are then stored in a
     pandas DataFrame for convenient visualization.
 
-    **Signs of High Risk**:
+    ### Signs of High Risk
+
     - Low values for performance metrics may suggest a reduction in model performance, particularly a low AUC which
     indicates poor classification performance, or a low GINI coefficient, which could suggest a decreased ability to
     discriminate different classes.
@@ -38,7 +41,8 @@ class GINITable(Metric):
     - Significant discrepancies between the performance on the training dataset and the test dataset may present
     another signal of high risk.
 
-    **Strengths**:
+    ### Strengths
+
     - Offers three key performance metrics (AUC, GINI, and KS) in one test, providing a more comprehensive evaluation
     of the model.
     - Provides a direct comparison between the model's performance on training and testing datasets, which aids in
@@ -47,7 +51,8 @@ class GINITable(Metric):
     performance even when dealing with imbalanced datasets.
     - Presents the metrics in a user-friendly table format for easy comprehension and analysis.
 
-    **Limitations**:
+    ### Limitations
+
     - The GINI coefficient and KS statistic are both dependent on the AUC value. Therefore, any errors in the
     calculation of the latter will adversely impact the former metrics too.
     - Mainly suited for binary classification models and may require modifications for effective application in
@@ -57,64 +62,26 @@ class GINITable(Metric):
     lead to inaccuracies in the metrics if the data is not appropriately preprocessed.
     """
 
-    name = "gini_table"
-    required_inputs = ["model", "datasets"]
-    tasks = ["classification"]
-    tags = ["visualization", "model_performance"]
-
-    def run(self):
-
-        summary_metrics = self.compute_metrics()
-
-        return self.cache_results(
-            {
-                "metrics_summary": summary_metrics.to_dict(orient="records"),
-            }
-        )
-
-    def compute_metrics(self):
-        """Computes AUC, GINI, and KS for an arbitrary number of datasets."""
-        # Initialize the dictionary to store results
-        metrics_dict = {"Dataset": [], "AUC": [], "GINI": [], "KS": []}
-
-        # Iterate over each dataset in the inputs
-        for _, dataset in enumerate(self.inputs.datasets):
-            dataset_label = (
-                dataset.input_id
-            )  # Use input_id as the label for each dataset
-            metrics_dict["Dataset"].append(dataset_label)
-
-            # Retrieve y_true and y_pred for the current dataset
-            y_true = np.ravel(dataset.y)  # Flatten y_true to make it one-dimensional
-            y_prob = dataset.y_prob(self.inputs.model)
-
-            # Compute metrics
-            y_true = np.array(y_true, dtype=float)
-            y_prob = np.array(y_prob, dtype=float)
-
-            fpr, tpr, _ = roc_curve(y_true, y_prob)
-            ks = max(tpr - fpr)
-            auc = roc_auc_score(y_true, y_prob)
-            gini = 2 * auc - 1
-
-            # Add the metrics to the dictionary
-            metrics_dict["AUC"].append(auc)
-            metrics_dict["GINI"].append(gini)
-            metrics_dict["KS"].append(ks)
-
-        # Create a DataFrame to store and return the results
-        metrics_df = pd.DataFrame(metrics_dict)
-        return metrics_df
-
-    def summary(self, metric_value):
-        summary_metrics_table = metric_value["metrics_summary"]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=summary_metrics_table,
-                    metadata=ResultTableMetadata(
-                        title="AUC, GINI and KS for train and test datasets"
-                    ),
-                )
-            ]
-        )
+    metrics_dict = {"AUC": [], "GINI": [], "KS": []}
+
+    # Retrieve y_true and y_pred for the current dataset
+    y_true = np.ravel(dataset.y)  # Flatten y_true to make it one-dimensional
+    y_prob = dataset.y_prob(model)
+
+    # Compute metrics
+    y_true = np.array(y_true, dtype=float)
+    y_prob = np.array(y_prob, dtype=float)
+
+    fpr, tpr, _ = roc_curve(y_true, y_prob)
+    ks = max(tpr - fpr)
+    auc = roc_auc_score(y_true, y_prob)
+    gini = 2 * auc - 1
+
+    # Add the metrics to the dictionary
+    metrics_dict["AUC"].append(auc)
+    metrics_dict["GINI"].append(gini)
+    metrics_dict["KS"].append(ks)
+
+    # Create a DataFrame to store and return the results
+    metrics_df = pd.DataFrame(metrics_dict)
+    return metrics_df
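
The refactor above replaces the class-based Metric with a plain decorated function that scores a single dataset. The arithmetic it relies on (GINI = 2 * AUC - 1, KS = max(TPR - FPR)) can be sanity-checked outside ValidMind; a minimal standalone sketch with made-up labels and probabilities (not part of this package):

import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

# Illustrative synthetic labels and predicted probabilities only
rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=1000).astype(float)
y_prob = np.clip(0.35 * y_true + 0.65 * rng.random(1000), 0.0, 1.0)

auc = roc_auc_score(y_true, y_prob)   # Area under the ROC curve
gini = 2 * auc - 1                    # GINI is a linear rescaling of AUC
fpr, tpr, _ = roc_curve(y_true, y_prob)
ks = max(tpr - fpr)                   # KS: maximum separation of TPR and FPR

print(f"AUC={auc:.3f}  GINI={gini:.3f}  KS={ks:.3f}")

Because GINI and KS are both derived from the same ROC inputs, any error in the probability estimates propagates to all three numbers, which is exactly the dependency the Limitations section calls out.
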
validmind/tests/model_validation/statsmodels/JarqueBera.py

@@ -11,36 +11,41 @@ class JarqueBera(Metric):
     """
     Assesses normality of dataset features in an ML model using the Jarque-Bera test.
 
-    **Purpose**: The purpose of the Jarque-Bera test as implemented in this metric is to determine if the features in
-    the dataset of a given Machine Learning model follows a normal distribution. This is crucial for understanding the
-    distribution and behavior of the model's features, as numerous statistical methods assume normal distribution of
-    the data.
-
-    **Test Mechanism**: The test mechanism involves computing the Jarque-Bera statistic, p-value, skew, and kurtosis
-    for each feature in the dataset. It utilizes the 'jarque_bera' function from the 'statsmodels' library in Python,
-    storing the results in a dictionary. The test evaluates the skewness and kurtosis to ascertain whether the dataset
-    follows a normal distribution. A significant p-value (typically less than 0.05) implies that the data does not
-    possess normal distribution.
-
-    **Signs of High Risk**:
-    - A high Jarque-Bera statistic and a low p-value (usually less than 0.05) indicates high-risk conditions.
+    ### Purpose
+
+    The purpose of the Jarque-Bera test as implemented in this metric is to determine if the features in the dataset of
+    a given Machine Learning model follow a normal distribution. This is crucial for understanding the distribution and
+    behavior of the model's features, as numerous statistical methods assume normal distribution of the data.
+
+    ### Test Mechanism
+
+    The test mechanism involves computing the Jarque-Bera statistic, p-value, skew, and kurtosis for each feature in
+    the dataset. It utilizes the 'jarque_bera' function from the 'statsmodels' library in Python, storing the results
+    in a dictionary. The test evaluates the skewness and kurtosis to ascertain whether the dataset follows a normal
+    distribution. A significant p-value (typically less than 0.05) implies that the data does not possess normal
+    distribution.
+
+    ### Signs of High Risk
+
+    - A high Jarque-Bera statistic and a low p-value (usually less than 0.05) indicate high-risk conditions.
     - Such results suggest the data significantly deviates from a normal distribution. If a machine learning model
     expects feature data to be normally distributed, these findings imply that it may not function as intended.
 
-    **Strengths**:
-    - This test provides insights into the shape of the data distribution, helping determine whether a given set of
-    data follows a normal distribution.
-    - This is particularly useful for risk assessment for models that assume a normal distribution of data.
+    ### Strengths
+
+    - Provides insights into the shape of the data distribution, helping determine whether a given set of data follows
+    a normal distribution.
+    - Particularly useful for risk assessment for models that assume a normal distribution of data.
     - By measuring skewness and kurtosis, it provides additional insights into the nature and magnitude of a
     distribution's deviation.
 
-    **Limitations**:
-    - The Jarque-Bera test only checks for normality in the data distribution. It cannot provide insights into other
-    types of distributions.
+    ### Limitations
+
+    - Only checks for normality in the data distribution. It cannot provide insights into other types of distributions.
     - Datasets that aren't normally distributed but follow some other distribution might lead to inaccurate risk
     assessments.
-    - The test is highly sensitive to large sample sizes, often rejecting the null hypothesis (that data is normally
-    distributed) even for minor deviations in larger datasets.
+    - Highly sensitive to large sample sizes, often rejecting the null hypothesis (that data is normally distributed)
+    even for minor deviations in larger datasets.
     """
 
     name = "jarque_bera"
validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py

@@ -13,40 +13,39 @@ from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 @dataclass
 class KolmogorovSmirnov(Metric):
     """
-    Executes a feature-wise Kolmogorov-Smirnov test to evaluate alignment with normal distribution in datasets.
-
-    **Purpose**: This metric employs the Kolmogorov-Smirnov (KS) test to evaluate the distribution of a dataset's
-    features. It specifically gauges whether the data from each feature aligns with a normal distribution, a common
-    presumption in many statistical methods and machine learning models.
-
-    **Test Mechanism**: This KS test calculates the KS statistic and the corresponding p-value for each column in a
-    dataset. It achieves this by contrasting the cumulative distribution function of the dataset's feature with an
-    ideal normal distribution. Subsequently, a feature-by-feature KS statistic and p-value are stored in a dictionary.
-    The specific threshold p-value (the value below which we reject the hypothesis that the data is drawn from a normal
-    distribution) is not firmly set within this implementation, allowing for definitional flexibility depending on the
-    specific application.
-
-    **Signs of High Risk**:
-    - Elevated KS statistic for a feature combined with a low p-value. This suggests a significant divergence between
-    the feature's distribution and a normal one.
-    - Features with notable deviations. These could create problems if the applicable model makes assumptions about
-    normal data distribution, thereby representing a risk.
-
-    **Strengths**:
-    - The KS test is acutely sensitive to differences in the location and shape of the empirical cumulative
-    distribution functions of two samples.
-    - It is non-parametric and does not presuppose any specific data distribution, making it adaptable to a range of
-    datasets.
-    - With its focus on individual features, it offers detailed insights into data distribution.
-
-    **Limitations**:
-    - The sensitivity of the KS test to disparities in data distribution tails can be excessive. Such sensitivity might
-    prompt false alarms about non-normal distributions, particularly in situations where these tail tendencies are
-    irrelevant to the model.
-    - It could become less effective when applied to multivariate distributions, considering that it's primarily
-    configured for univariate distributions.
-    - As a goodness-of-fit test, the KS test does not identify specific types of non-normality, such as skewness or
-    kurtosis, that could directly impact model fitting.
+    Assesses whether each feature in the dataset aligns with a normal distribution using the Kolmogorov-Smirnov test.
+
+    ### Purpose
+
+    The Kolmogorov-Smirnov (KS) test evaluates the distribution of features in a dataset to determine their alignment
+    with a normal distribution. This is important because many statistical methods and machine learning models assume
+    normality in the data distribution.
+
+    ### Test Mechanism
+
+    This test calculates the KS statistic and corresponding p-value for each feature in the dataset. It does so by
+    comparing the cumulative distribution function of the feature with an ideal normal distribution. The KS statistic
+    and p-value for each feature are then stored in a dictionary. The p-value threshold to reject the normal
+    distribution hypothesis is not preset, providing flexibility for different applications.
+
+    ### Signs of High Risk
+
+    - Elevated KS statistic for a feature combined with a low p-value, indicating a significant divergence from a
+    normal distribution.
+    - Features with notable deviations that could create problems if the model assumes normality in data distribution.
+
+    ### Strengths
+
+    - The KS test is sensitive to differences in the location and shape of empirical cumulative distribution functions.
+    - It is non-parametric and adaptable to various datasets, as it does not assume any specific data distribution.
+    - Provides detailed insights into the distribution of individual features.
+
+    ### Limitations
+
+    - The test's sensitivity to disparities in the tails of data distribution might cause false alarms about
+    non-normality.
+    - Less effective for multivariate distributions, as it is designed for univariate distributions.
+    - Does not identify specific types of non-normality, such as skewness or kurtosis, which could impact model fitting.
     """
 
     name = "kolmogorov_smirnov"
validmind/tests/model_validation/statsmodels/LJungBox.py

@@ -11,36 +11,40 @@ class LJungBox(Metric):
     """
     Assesses autocorrelations in dataset features by performing a Ljung-Box test on each feature.
 
-    **Purpose**: The Ljung-Box test is a type of statistical test utilized to ascertain whether there are
-    autocorrelations within a given dataset that differ significantly from zero. In the context of a machine learning
-    model, this test is primarily used to evaluate data utilized in regression tasks, especially those involving time
-    series and forecasting.
-
-    **Test Mechanism**: The test operates by iterating over each feature within the training dataset and applying the
-    `acorr_ljungbox` function from the `statsmodels.stats.diagnostic` library. This function calculates the Ljung-Box
-    statistic and p-value for each feature. These results are then stored in a dictionary where the keys are the
-    feature names and the values are dictionaries containing the statistic and p-value respectively. Generally, a lower
-    p-value indicates a higher likelihood of significant autocorrelations within the feature.
-
-    **Signs of High Risk**:
-    - A high risk or failure in the model's performance relating to this test might be indicated by high Ljung-Box
-    statistic values or low p-values.
-    - These outcomes suggest the presence of significant autocorrelations in the respective features. If not properly
-    consider or handle in the machine learning model, these can negatively affect model performance or bias.
-
-    **Strengths**:
-    - The Ljung-Box test is a powerful tool for detecting autocorrelations within datasets, especially in time series
-    data.
-    - It provides quantitative measures (statistic and p-value) that allow for precise evaluation of autocorrelation.
-    - This test can be instrumental in avoiding issues related to autoregressive residuals and other challenges in
-    regression models.
-
-    **Limitations**:
-    - The Ljung-Box test cannot detect all types of non-linearity or complex interrelationships among variables.
+    ### Purpose
+
+    The Ljung-Box test is a type of statistical test utilized to ascertain whether there are autocorrelations within a
+    given dataset that differ significantly from zero. In the context of a machine learning model, this test is
+    primarily used to evaluate data utilized in regression tasks, especially those involving time series and
+    forecasting.
+
+    ### Test Mechanism
+
+    The test operates by iterating over each feature within the training dataset and applying the `acorr_ljungbox`
+    function from the `statsmodels.stats.diagnostic` library. This function calculates the Ljung-Box statistic and
+    p-value for each feature. These results are then stored in a dictionary where the keys are the feature names and
+    the values are dictionaries containing the statistic and p-value respectively. Generally, a lower p-value indicates
+    a higher likelihood of significant autocorrelations within the feature.
+
+    ### Signs of High Risk
+
+    - High Ljung-Box statistic values or low p-values.
+    - Presence of significant autocorrelations in the respective features.
+    - Potential for negative impact on model performance or bias if autocorrelations are not properly handled.
+
+    ### Strengths
+
+    - Powerful tool for detecting autocorrelations within datasets, especially in time series data.
+    - Provides quantitative measures (statistic and p-value) for precise evaluation.
+    - Helps avoid issues related to autoregressive residuals and other challenges in regression models.
+
+    ### Limitations
+
+    - Cannot detect all types of non-linearity or complex interrelationships among variables.
     - Testing individual features may not fully encapsulate the dynamics of the data if features interact with each
     other.
-    - It is designed more for traditional statistical models and may not be fully compatible with certain types of
-    complex machine learning models.
+    - Designed more for traditional statistical models and may not be fully compatible with certain types of complex
+    machine learning models.
     """
 
     name = "ljung_box"
validmind/tests/model_validation/statsmodels/Lilliefors.py

@@ -14,44 +14,47 @@ class Lilliefors(Metric):
     """
     Assesses the normality of feature distributions in an ML model's training dataset using the Lilliefors test.
 
-    **Purpose**: The purpose of this metric is to utilize the Lilliefors test, named in honor of the Swedish
-    statistician Hubert Lilliefors, in order to assess whether the features of the machine learning model's training
-    dataset conform to a normal distribution. This is done because the assumption of normal distribution plays a vital
-    role in numerous statistical procedures as well as numerous machine learning models. Should the features fail to
-    follow a normal distribution, some model types may not operate at optimal efficiency. This can potentially lead to
-    inaccurate predictions.
-
-    **Test Mechanism**: The application of this test happens across all feature columns within the training dataset.
-    For each feature, the Lilliefors test returns a test statistic and p-value. The test statistic quantifies how far
-    the feature's distribution is from an ideal normal distribution, whereas the p-value aids in determining the
-    statistical relevance of this deviation. The final results are stored within a dictionary, the keys of which
-    correspond to the name of the feature column, and the values being another dictionary which houses the test
-    statistic and p-value.
-
-    **Signs of High Risk**:
+    ### Purpose
+
+    The purpose of this metric is to utilize the Lilliefors test, named in honor of the Swedish statistician Hubert
+    Lilliefors, in order to assess whether the features of the machine learning model's training dataset conform to a
+    normal distribution. This is done because the assumption of normal distribution plays a vital role in numerous
+    statistical procedures as well as numerous machine learning models. Should the features fail to follow a normal
+    distribution, some model types may not operate at optimal efficiency. This can potentially lead to inaccurate
+    predictions.
+
+    ### Test Mechanism
+
+    The application of this test happens across all feature columns within the training dataset. For each feature, the
+    Lilliefors test returns a test statistic and p-value. The test statistic quantifies how far the feature's
+    distribution is from an ideal normal distribution, whereas the p-value aids in determining the statistical
+    relevance of this deviation. The final results are stored within a dictionary, the keys of which correspond to the
+    name of the feature column, and the values being another dictionary which houses the test statistic and p-value.
+
+    ### Signs of High Risk
 
     - If the p-value corresponding to a specific feature sinks below a pre-established significance level, generally
     set at 0.05, then it can be deduced that the distribution of that feature significantly deviates from a normal
     distribution. This can present a high risk for models that assume normality, as these models may perform
     inaccurately or inefficiently in the presence of such a feature.
 
-    **Strengths**:
+    ### Strengths
 
     - One advantage of the Lilliefors test is its utility irrespective of whether the mean and variance of the normal
     distribution are known in advance. This makes it a more robust option in real-world situations where these values
     might not be known.
-    - Second, the test has the ability to screen every feature column, offering a holistic view of the dataset.
+    - The test has the ability to screen every feature column, offering a holistic view of the dataset.
 
-    **Limitations**:
+    ### Limitations
 
     - Despite the practical applications of the Lilliefors test in validating normality, it does come with some
     limitations.
-    - Firstly, it is only capable of testing unidimensional data, thus rendering it ineffective for datasets with
-    interactions between features or multi-dimensional phenomena.
-    - Additionally, the test might not be as sensitive as some other tests (like the Anderson-Darling test) in
-    detecting deviations from a normal distribution.
-    - Lastly, like any other statistical test, Lilliefors test may also produce false positives or negatives. Hence,
-    banking solely on this test, without considering other characteristics of the data, may give rise to risks.
+    - It is only capable of testing unidimensional data, thus rendering it ineffective for datasets with interactions
+    between features or multi-dimensional phenomena.
+    - The test might not be as sensitive as some other tests (like the Anderson-Darling test) in detecting deviations
+    from a normal distribution.
+    - Like any other statistical test, Lilliefors test may also produce false positives or negatives. Hence, banking
+    solely on this test, without considering other characteristics of the data, may give rise to risks.
     """
 
     name = "lilliefors_test"