validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.24.dist-info/METADATA +0 -118
  196. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/EngleGrangerCoint.py
@@ -5,10 +5,14 @@
  import pandas as pd
  from statsmodels.tsa.stattools import coint

- from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+ from validmind import tags, tasks
+ from validmind.errors import SkipTestError
+ from validmind.vm_models import VMDataset


- class EngleGrangerCoint(Metric):
+ @tags("time_series_data", "statistical_test", "forecasting")
+ @tasks("regression")
+ def EngleGrangerCoint(dataset: VMDataset, threshold: float = 0.05):
      """
      Assesses the degree of co-movement between pairs of time series data using the Engle-Granger cointegration test.

@@ -48,80 +52,54 @@ class EngleGrangerCoint(Metric):
      - May not perform well for small sample sizes due to lack of statistical power and should be supplemented with
      other predictive indicators for a more robust model evaluation.
      """
+     df = dataset.df

-     type = "dataset"
-     name = "engle_granger_coint"
-     required_inputs = ["dataset"]
-     default_params = {"threshold": 0.05}
-     tasks = ["regression"]
-     tags = ["time_series_data", "statistical_test", "forecasting"]
-
-     def run(self):
-         threshold = self.params["threshold"]
-         df = self.inputs.dataset.df.dropna()
-
-         # Create an empty DataFrame to store the results
-         summary_cointegration = pd.DataFrame()
-
-         columns = df.columns
-         num_vars = len(columns)
-
-         for i in range(num_vars):
-             for j in range(i + 1, num_vars):
-                 var1 = columns[i]
-                 var2 = columns[j]
-
-                 # Perform the Engle-Granger cointegration test
-                 _, p_value, _ = coint(df[var1], df[var2])
-
-                 # Determine the decision based on the p-value and the significance level
-                 decision = (
-                     "Cointegrated" if p_value <= threshold else "Not cointegrated"
-                 )
-                 pass_fail = "Pass" if p_value <= threshold else "Fail"
-
-                 # Append the result of each test directly into the DataFrame
-                 summary_cointegration = pd.concat(
-                     [
-                         summary_cointegration,
-                         pd.DataFrame(
-                             [
-                                 {
-                                     "Variable 1": var1,
-                                     "Variable 2": var2,
-                                     "Test": "Engle-Granger",
-                                     "p-value": p_value,
-                                     "Threshold": threshold,
-                                     "Pass/Fail": pass_fail,
-                                     "Decision": decision,
-                                 }
-                             ]
-                         ),
-                     ],
-                     ignore_index=True,
-                 )
-
-         return self.cache_results(
-             {
-                 "cointegration_analysis": summary_cointegration.to_dict(
-                     orient="records"
-                 ),
-             }
+     # Validate that the index is datetime
+     if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+         raise SkipTestError(
+             "Dataset index must be a datetime or period index for cointegration analysis."
          )

-     def summary(self, metric_value):
-         """
-         Build one table for summarizing the cointegration results
-         """
-         summary_cointegration = metric_value["cointegration_analysis"]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=summary_cointegration,
-                     metadata=ResultTableMetadata(
-                         title="Cointegration Analysis Results"
+     df = dataset.df.dropna()
+
+     summary_cointegration = pd.DataFrame()
+
+     columns = df.columns
+     num_vars = len(columns)
+
+     for i in range(num_vars):
+         for j in range(i + 1, num_vars):
+             var1 = columns[i]
+             var2 = columns[j]
+
+             # Perform the Engle-Granger cointegration test
+             _, p_value, _ = coint(df[var1], df[var2])
+
+             # Determine the decision based on the p-value and the significance level
+             decision = "Cointegrated" if p_value <= threshold else "Not cointegrated"
+             pass_fail = "Pass" if p_value <= threshold else "Fail"
+
+             # Append the result of each test directly into the DataFrame
+             summary_cointegration = pd.concat(
+                 [
+                     summary_cointegration,
+                     pd.DataFrame(
+                         [
+                             {
+                                 "Variable 1": var1,
+                                 "Variable 2": var2,
+                                 "Test": "Engle-Granger",
+                                 "p-value": p_value,
+                                 "Threshold": threshold,
+                                 "Pass/Fail": pass_fail,
+                                 "Decision": decision,
+                             }
+                         ]
                      ),
-                 ),
-             ]
-         )
+                 ],
+                 ignore_index=True,
+             )
+
+     return {
+         "Cointegration Analysis Results": summary_cointegration,
+     }
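
This hunk is representative of the 2.5.24 → 2.6.7 migration that runs through the whole diff: class-based Metric subclasses with run()/summary() methods and class-level name/required_inputs/default_params attributes become plain functions decorated with @tags and @tasks, taking typed inputs (VMDataset) plus keyword parameters and returning display-ready objects directly (here, a dict mapping a table title to a DataFrame). As a rough sketch of how such a test is typically invoked via run_test, assuming a ValidMind session has already been set up with vm.init(); the dataset construction and column names below are illustrative, not from this diff:

import pandas as pd
import validmind as vm

# Toy time series with a datetime index (the rewritten test raises
# SkipTestError for any other index type)
idx = pd.date_range("2020-01-01", periods=120, freq="D")
df = pd.DataFrame(
    {"rate_a": range(120), "rate_b": [2 * x + 1 for x in range(120)]},
    index=idx,
)
vm_dataset = vm.init_dataset(dataset=df, input_id="toy_rates")

# params map onto the function's keyword arguments, inputs onto its typed
# arguments; the test ID follows the module path shown in the file list
result = vm.tests.run_test(
    "validmind.data_validation.EngleGrangerCoint",
    inputs={"dataset": vm_dataset},
    params={"threshold": 0.05},
)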
validmind/tests/data_validation/HighCardinality.py
@@ -2,23 +2,18 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
- from typing import List
-
- from ydata_profiling.config import Settings
- from ydata_profiling.model.typeset import ProfilingTypeSet
-
- from validmind.vm_models import (
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
- )
-
-
- @dataclass
- class HighCardinality(ThresholdTest):
+ from validmind import tags, tasks
+ from validmind.vm_models import VMDataset
+
+
+ @tags("tabular_data", "data_quality", "categorical_data")
+ @tasks("classification", "regression")
+ def HighCardinality(
+     dataset: VMDataset,
+     num_threshold: int = 100,
+     percent_threshold: float = 0.1,
+     threshold_type: str = "percent",
+ ):
      """
      Assesses the number of unique values in categorical columns to detect high cardinality and potential overfitting.

@@ -56,72 +51,29 @@ class HighCardinality(ThresholdTest):
      - The threshold (both number and percent) used for the test is static and may not be optimal for diverse datasets
      and varied applications. Further mechanisms to adjust and refine this threshold could enhance its effectiveness.
      """
+     df = dataset.df

-     name = "cardinality"
-     required_inputs = ["dataset"]
-     default_params = {
-         "num_threshold": 100,
-         "percent_threshold": 0.1,
-         "threshold_type": "percent",  # or "num"
-     }
-     tasks = ["classification", "regression"]
-     tags = ["tabular_data", "data_quality", "categorical_data"]
-
-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         """
-         The high cardinality test returns results like these:
-         [{"values": {"n_distinct": 0, "p_distinct": 0.0}, "column": "Exited", "passed": true}]
-         """
-         results_table = [
-             {
-                 "Column": result.column,
-                 "Number of Distinct Values": result.values["n_distinct"],
-                 "Percentage of Distinct Values (%)": result.values["p_distinct"] * 100,
-                 "Pass/Fail": "Pass" if result.passed else "Fail",
-             }
-             for result in results
-         ]
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=results_table,
-                     metadata=ResultTableMetadata(
-                         title="High Cardinality Results for Dataset"
-                     ),
-                 )
-             ]
-         )
+     if threshold_type == "percent":
+         num_threshold = int(percent_threshold * df.shape[0])

-     def run(self):
-         typeset = ProfilingTypeSet(Settings())
-         dataset_types = typeset.infer_type(self.inputs.dataset.df)
+     table = []
+     all_passed = True

-         results = []
-         rows = self.inputs.dataset.df.shape[0]
+     for col in dataset.feature_columns_categorical:
+         n_distinct = df[col].nunique()
+         p_distinct = n_distinct / df.shape[0]
+         passed = n_distinct < num_threshold

-         num_threshold = self.params["num_threshold"]
-         if self.params["threshold_type"] == "percent":
-             num_threshold = int(self.params["percent_threshold"] * rows)
-
-         for col in self.inputs.dataset.df.columns:
-             # Only calculate high cardinality for categorical columns
-             if str(dataset_types[col]) != "Categorical":
-                 continue
-
-             n_distinct = self.inputs.dataset.df[col].nunique()
-             p_distinct = n_distinct / rows
-
-             passed = n_distinct < num_threshold
+         table.append(
+             {
+                 "Column": col,
+                 "Number of Distinct Values": n_distinct,
+                 "Percentage of Distinct Values (%)": p_distinct * 100,
+                 "Pass/Fail": "Pass" if passed else "Fail",
+             }
+         )

-             results.append(
-                 ThresholdTestResult(
-                     column=col,
-                     passed=passed,
-                     values={
-                         "n_distinct": n_distinct,
-                         "p_distinct": p_distinct,
-                     },
-                 )
-             )
+         if not passed:
+             all_passed = False

-         return self.cache_results(results, passed=all([r.passed for r in results]))
+     return table, all_passed
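
Because the rewritten test is an ordinary function returning a (table, all_passed) tuple, and the @tags/@tasks decorators appear to only attach metadata, it can also be called directly for a quick interactive check outside the test runner. A sketch under those assumptions, with invented columns and data:

import pandas as pd
import validmind as vm

from validmind.tests.data_validation.HighCardinality import HighCardinality

df = pd.DataFrame(
    {
        "customer_id": [f"c{i}" for i in range(1000)],  # ~100% distinct, should fail
        "region": ["north", "south", "east", "west"] * 250,  # 4 values, should pass
        "churned": [0, 1] * 500,
    }
)
vm_dataset = vm.init_dataset(dataset=df, target_column="churned")

# threshold_type defaults to "percent", so the cutoff is 0.05 * 1000 = 50
table, all_passed = HighCardinality(vm_dataset, percent_threshold=0.05)
print(all_passed)  # False, driven by customer_id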
validmind/tests/data_validation/HighPearsonCorrelation.py
@@ -2,23 +2,15 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
- from typing import List
+ from validmind import tags, tasks
+ from validmind.vm_models import VMDataset

- import numpy as np
- import pandas as pd

- from validmind.vm_models import (
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
- )
-
-
- @dataclass
- class HighPearsonCorrelation(ThresholdTest):
+ @tags("tabular_data", "data_quality", "correlation")
+ @tasks("classification", "regression")
+ def HighPearsonCorrelation(
+     dataset: VMDataset, max_threshold: float = 0.3, top_n_correlations: int = 10
+ ):
      """
      Identifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity.

@@ -33,8 +25,9 @@ class HighPearsonCorrelation(ThresholdTest):

      The test works by generating pairwise Pearson correlations for all features in the dataset, then sorting and
      eliminating duplicate and self-correlations. It assigns a Pass or Fail based on whether the absolute value of the
-     correlation coefficient surpasses a pre-set threshold (defaulted at 0.3). It lastly returns the top ten strongest
-     correlations regardless of passing or failing status.
+     correlation coefficient surpasses a pre-set threshold (defaulted at 0.3). It lastly returns the top n strongest
+     correlations regardless of passing or failing status (where n is 10 by default but can be configured by passing the
+     `top_n_correlations` parameter).

      ### Signs of High Risk

@@ -57,86 +50,25 @@ class HighPearsonCorrelation(ThresholdTest):
      - Sensitive to outliers where a few outliers could notably affect the correlation coefficient.
      - Limited to identifying redundancy only within feature pairs; may fail to spot more complex relationships among
      three or more variables.
-     - The top 10 result filter might not fully capture the richness of the data; an option to configure the number of
-     retained results could be helpful.
      """
-
-     name = "pearson_correlation"
-     required_inputs = ["dataset"]
-     default_params = {"max_threshold": 0.3}
-     tasks = ["classification", "regression"]
-     tags = ["tabular_data", "data_quality", "correlation"]
-
-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         """The high pearson correlation test returns results like these:
-         [
-             {
-                 "values": {
-                     "correlations": [
-                         {"column": "NumOfProducts", "correlation": -0.3044645622389459}
-                     ]
-                 },
-                 "column": "Balance",
-                 "passed": false,
-             }
-         ]
-         """
-         results_table = [
-             {
-                 "Columns": f'({result.column}, {result.values["correlations"][0]["column"]})',
-                 "Coefficient": result.values["correlations"][0]["correlation"],
-                 "Pass/Fail": "Pass" if result.passed else "Fail",
-             }
-             for result in results
-         ]
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=results_table,
-                     metadata=ResultTableMetadata(
-                         title="High Pearson Correlation Results for Dataset"
-                     ),
-                 )
-             ]
-         )
-
-     def run(self):
-         corr = self.inputs.dataset.df.corr(numeric_only=True)
-
-         # Create a table of correlation coefficients and column pairs
-         corr_table = corr.unstack().sort_values(
-             kind="quicksort", key=abs, ascending=False
-         )
-         corr_df = pd.DataFrame(corr_table).reset_index()
-         corr_df.columns = ["Column1", "Column2", "Coefficient"]
-
-         # Remove duplicate correlations and self-correlations
-         corr_df = corr_df.loc[corr_df["Column1"] < corr_df["Column2"]]
-
-         # Assign Pass/Fail based on correlation coefficient
-         corr_df["Pass/Fail"] = np.where(
-             corr_df["Coefficient"].abs() <= self.params["max_threshold"], "Pass", "Fail"
-         )
-
-         # Only keep the top 10 correlations. TODO: configurable
-         corr_df = corr_df.head(10)
-
-         passed = corr_df["Pass/Fail"].eq("Pass").all()
-
-         results = [
-             ThresholdTestResult(
-                 column=col1,
-                 values={
-                     "correlations": [
-                         {
-                             "column": col2,
-                             "correlation": coeff,
-                         }
-                     ]
-                 },
-                 passed=pass_fail == "Pass",
+     # Get correlation matrix for numeric columns
+     corr = dataset.df.corr(numeric_only=True)
+
+     # Create table of correlation coefficients and column pairs
+     pairs = []
+     for i in range(len(corr.columns)):
+         for j in range(i + 1, len(corr.columns)):
+             coeff = corr.iloc[i, j]
+             pairs.append(
+                 {
+                     "Columns": f"({corr.columns[i]}, {corr.columns[j]})",
+                     "Coefficient": coeff,
+                     "Pass/Fail": "Pass" if abs(coeff) <= max_threshold else "Fail",
+                 }
              )
-             for _, (col1, col2, coeff, pass_fail) in corr_df.iterrows()
-         ]

-         return self.cache_results(results, passed=passed)
+     # Sort by absolute coefficient and get top N
+     pairs.sort(key=lambda x: abs(x["Coefficient"]), reverse=True)
+     pairs = pairs[:top_n_correlations]
+
+     return pairs, all(p["Pass/Fail"] == "Pass" for p in pairs)
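
The rewrite drops the old unstack/sort/dedup pipeline in favor of an explicit walk over the upper triangle of the correlation matrix, which visits each unordered column pair exactly once and needs neither numpy nor the lexicographic Column1 < Column2 filter. A standalone illustration of that pattern in plain pandas, on toy data:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 3, 2, 1]})
corr = df.corr(numeric_only=True)

# j > i restricts iteration to the upper triangle: no self-pairs, no duplicates
pairs = [
    (corr.columns[i], corr.columns[j], corr.iloc[i, j])
    for i in range(len(corr.columns))
    for j in range(i + 1, len(corr.columns))
]

# n columns yield n*(n-1)/2 pairs; here 3 columns give (a, b), (a, c), (b, c)
assert [(p[0], p[1]) for p in pairs] == [("a", "b"), ("a", "c"), ("b", "c")]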
validmind/tests/data_validation/IQROutliersBarPlot.py
@@ -2,15 +2,27 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
-
  import plotly.graph_objects as go

- from validmind.vm_models import Figure, Metric
+ from validmind import tags, tasks
+ from validmind.vm_models import VMDataset
+
+
+ def compute_outliers(series, threshold):
+     Q1 = series.quantile(0.25)
+     Q3 = series.quantile(0.75)
+     IQR = Q3 - Q1
+     lower_bound = Q1 - threshold * IQR
+     upper_bound = Q3 + threshold * IQR
+
+     return series[(series < lower_bound) | (series > upper_bound)]


- @dataclass
- class IQROutliersBarPlot(Metric):
+ @tags("tabular_data", "visualization", "numerical_data")
+ @tasks("classification", "regression")
+ def IQROutliersBarPlot(
+     dataset: VMDataset, threshold: float = 1.5, fig_width: int = 800
+ ):
      """
      Visualizes outlier distribution across percentiles in numerical data using the Interquartile Range (IQR) method.

@@ -54,99 +66,56 @@ class IQROutliersBarPlot(Metric):
      ### Limitations

      - Its application is limited to numerical variables and does not extend to categorical ones.
-     - Relies on a predefined threshold (default being 1.5) for outlier identification, which may not be suitable for
-     all cases.
      - Only reveals the presence and distribution of outliers and does not provide insights into how these outliers
      might affect the model's predictive performance.
      - The assumption that data is unimodal and symmetric may not always hold true. In cases with non-normal
      distributions, the results can be misleading.
      """
-
-     name = "iqr_outliers_bar_plot"
-     required_inputs = ["dataset"]
-     default_params = {"threshold": 1.5, "fig_width": 800}
-     tasks = ["classification", "regression"]
-     tags = ["tabular_data", "visualization", "numerical_data"]
-
-     def run(self):
-         df = self.inputs.dataset.df
-
-         # Select numerical features
-         features = self.inputs.dataset.feature_columns_numeric
-
-         # Select non-binary features
-         features = [
-             feature
-             for feature in features
-             if len(self.inputs.dataset.df[feature].unique()) > 2
+     df = dataset.df
+
+     figures = []
+
+     for col in dataset.feature_columns_numeric:
+         # Skip binary features
+         if len(df[col].unique()) <= 2:
+             continue
+
+         outliers = compute_outliers(df[col], threshold)
+         if outliers.empty:
+             continue
+
+         Q1_count = outliers[
+             (outliers >= 0) & (outliers < outliers.quantile(0.25))
+         ].count()
+         Q2_count = outliers[
+             (outliers >= outliers.quantile(0.25)) & (outliers < outliers.median())
+         ].count()
+         Q3_count = outliers[
+             (outliers >= outliers.median()) & (outliers < outliers.quantile(0.75))
+         ].count()
+         Q4_count = outliers[
+             (outliers >= outliers.quantile(0.75)) & (outliers <= outliers.max())
+         ].count()
+
+         bar_data = [Q1_count, Q2_count, Q3_count, Q4_count]
+         percentile_labels = [
+             "0-25",
+             "25-50",
+             "50-75",
+             "75-100",
          ]

-         threshold = self.params["threshold"]
-         fig_width = self.params["fig_width"]
-
-         df = df[features]
-
-         return self.detect_and_visualize_outliers(df, threshold, fig_width)
-
-     @staticmethod
-     def compute_outliers(series, threshold=1.5):
-         Q1 = series.quantile(0.25)
-         Q3 = series.quantile(0.75)
-         IQR = Q3 - Q1
-         lower_bound = Q1 - threshold * IQR
-         upper_bound = Q3 + threshold * IQR
-         return series[(series < lower_bound) | (series > upper_bound)]
-
-     def detect_and_visualize_outliers(self, df, threshold, fig_width):
-         num_cols = df.columns.tolist()
-         figures = []
-
-         for col in num_cols:
-             # Compute outliers
-             outliers = self.compute_outliers(df[col], threshold)
-
-             if outliers.empty:
-                 continue  # Skip plotting if there are no outliers
-
-             Q1_count = outliers[
-                 (outliers >= 0) & (outliers < outliers.quantile(0.25))
-             ].count()
-             Q2_count = outliers[
-                 (outliers >= outliers.quantile(0.25)) & (outliers < outliers.median())
-             ].count()
-             Q3_count = outliers[
-                 (outliers >= outliers.median()) & (outliers < outliers.quantile(0.75))
-             ].count()
-             Q4_count = outliers[
-                 (outliers >= outliers.quantile(0.75)) & (outliers <= outliers.max())
-             ].count()
-
-             # Prepare data for bar plot
-             bar_data = [Q1_count, Q2_count, Q3_count, Q4_count]
-             percentile_labels = [
-                 "0-25",
-                 "25-50",
-                 "50-75",
-                 "75-100",
-             ]
-
-             # Create a bar plot
-             fig = go.Figure(
-                 data=[go.Bar(x=percentile_labels, y=bar_data, marker_color="skyblue")]
-             )
-
-             # Set layout properties
-             fig.update_layout(
-                 title_text=col,
-                 width=fig_width,
-                 height=400,
-                 plot_bgcolor="white",
-                 xaxis_title="Percentile",
-                 yaxis_title="Outlier Count",
-             )
-
-             # Create a Figure object and append to figures list
-             figure = Figure(for_object=self, key=f"{self.key}:{col}", figure=fig)
-             figures.append(figure)
-
-         return self.cache_results(figures=figures)
+         fig = go.Figure(
+             data=[go.Bar(x=percentile_labels, y=bar_data, marker_color="skyblue")]
+         )
+         fig.update_layout(
+             title_text=col,
+             width=fig_width,
+             height=400,
+             plot_bgcolor="white",
+             xaxis_title="Percentile",
+             yaxis_title="Outlier Count",
+         )
+         figures.append(fig)
+
+     return tuple(figures)
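
Per the hunk above, compute_outliers is now a module-level helper rather than a @staticmethod, which makes the IQR fence logic easy to exercise on its own. For instance, on a made-up series:

import pandas as pd

from validmind.tests.data_validation.IQROutliersBarPlot import compute_outliers

# With threshold=1.5 the fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR; for this
# series Q1=2.25 and Q3=4.0, so only the value 100 falls outside them
s = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 5, 100])
print(compute_outliers(s, threshold=1.5))
# 9    100
# dtype: int64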