validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/IQROutliersTable.py

@@ -2,15 +2,23 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
-import pandas as pd
 
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+def compute_outliers(series, threshold=1.5):
+    Q1 = series.quantile(0.25)
+    Q3 = series.quantile(0.75)
+    IQR = Q3 - Q1
+    lower_bound = Q1 - threshold * IQR
+    upper_bound = Q3 + threshold * IQR
 
+    return series[(series < lower_bound) | (series > upper_bound)]
 
-@dataclass
-class IQROutliersTable(Metric):
+
+@tags("tabular_data", "numerical_data")
+@tasks("classification", "regression")
+def IQROutliersTable(dataset: VMDataset, threshold: float = 1.5):
     """
     Determines and summarizes outliers in numerical features using the Interquartile Range method.
 
@@ -53,80 +61,32 @@ class IQROutliersTable(Metric):
     - Default thresholds may not be optimal for data with heavy pre-processing, manipulation, or inherently high
     kurtosis (heavy tails).
     """
-
-    name = "iqr_outliers_table"
-    required_inputs = ["dataset"]
-    default_params = {"threshold": 1.5}
-    tasks = ["classification", "regression"]
-    tags = ["tabular_data", "numerical_data"]
-
-    def run(self):
-
-        # Select numerical features
-        features = self.inputs.dataset.feature_columns_numeric
-
-        # Select non-binary features
-        features = [
-            feature
-            for feature in features
-            if len(self.inputs.dataset.df[feature].unique()) > 2
-        ]
-
-        threshold = self.params["threshold"]
-
-        df = self.inputs.dataset.df
-
-        outliers_summary_table = self.detect_and_analyze_outliers(
-            df, features, threshold
+    df = dataset.df
+
+    outliers_table = []
+
+    for col in dataset.feature_columns_numeric:
+        # Skip binary features
+        if len(df[col].unique()) <= 2:
+            continue
+
+        outliers = compute_outliers(df[col], threshold)
+        if outliers.empty:
+            continue
+
+        outliers_table.append(
+            {
+                "Variable": col,
+                "Total Count of Outliers": outliers.count(),
+                "Mean Value of Variable": df[col].mean(),
+                "Minimum Outlier Value": outliers.min(),
+                "Outlier Value at 25th Percentile": outliers.quantile(0.25),
+                "Outlier Value at 50th Percentile": outliers.median(),
+                "Outlier Value at 75th Percentile": outliers.quantile(0.75),
+                "Maximum Outlier Value": outliers.max(),
+            }
        )
 
-        return self.cache_results(
-            {"outliers_summary_table": outliers_summary_table.to_dict(orient="records")}
-        )
-
-    @staticmethod
-    def compute_outliers(series, threshold=1.5):
-        Q1 = series.quantile(0.25)
-        Q3 = series.quantile(0.75)
-        IQR = Q3 - Q1
-        lower_bound = Q1 - threshold * IQR
-        upper_bound = Q3 + threshold * IQR
-        return series[(series < lower_bound) | (series > upper_bound)]
-
-    def detect_and_analyze_outliers(self, df, features, threshold=1.5):
-
-        outliers_summary = []
-        for feature in features:
-            outliers_series = self.compute_outliers(df[feature], threshold)
-            if not outliers_series.empty:
-                outliers_summary.append(
-                    {
-                        "Variable": feature,
-                        "Total Count of Outliers": outliers_series.count(),
-                        "Mean Value of Variable": df[feature].mean(),
-                        "Minimum Outlier Value": outliers_series.min(),
-                        "Outlier Value at 25th Percentile": outliers_series.quantile(
-                            0.25
-                        ),
-                        "Outlier Value at 50th Percentile": outliers_series.median(),
-                        "Outlier Value at 75th Percentile": outliers_series.quantile(
-                            0.75
-                        ),
-                        "Maximum Outlier Value": outliers_series.max(),
-                    }
-                )
-        outliers_summary_table = pd.DataFrame(outliers_summary)
-        return outliers_summary_table
-
-    def summary(self, metric_value):
-        outliers_summary_table = pd.DataFrame(metric_value["outliers_summary_table"])
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=outliers_summary_table,
-                    metadata=ResultTableMetadata(
-                        title="Summary of Outliers Detected by IQR Method"
-                    ),
-                ),
-            ]
-        )
+    return {
+        "Summary of Outliers Detected by IQR Method": outliers_table,
+    }
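
The refactor above replaces the class-based Metric with a plain decorated function plus a module-level compute_outliers helper. Since that helper is pure pandas, its fence logic can be sanity-checked outside ValidMind; a minimal sketch (the toy Series and expected output are illustrative, not part of the package):

import pandas as pd

def compute_outliers(series, threshold=1.5):
    # Interquartile-range fence, mirroring the helper added in the diff above.
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    return series[(series < Q1 - threshold * IQR) | (series > Q3 + threshold * IQR)]

values = pd.Series([10, 12, 11, 13, 12, 11, 10, 95])
print(compute_outliers(values))  # flags the single extreme value, 95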
validmind/tests/data_validation/IsolationForestOutliers.py

@@ -3,17 +3,23 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import itertools
-from dataclasses import dataclass
 
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn.ensemble import IsolationForest
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
 
-@dataclass
-class IsolationForestOutliers(Metric):
+@tags("tabular_data", "anomaly_detection")
+@tasks("classification")
+def IsolationForestOutliers(
+    dataset: VMDataset,
+    random_state: int = 0,
+    contamination: float = 0.1,
+    feature_columns: list = None,
+):
     """
     Detects outliers in a dataset using the Isolation Forest algorithm and visualizes results through scatter plots.
 
@@ -55,64 +61,36 @@ class IsolationForestOutliers(Metric):
     - Potential failure in detecting collective anomalies if they behave similarly to normal data
     - Potential lack of precision in identifying which features contribute most to the anomalous behavior
     """
-
-    name = "isolation_forest"
-    default_params = {
-        "random_state": 0,
-        "contamination": 0.1,
-        "features_columns": None,
-    }
-    tasks = ["classification"]
-    tags = ["tabular_data", "anomaly_detection"]
-
-    required_inputs = ["dataset"]
-
-    def run(self):
-        if self.params["features_columns"] is None:
-            features_list = self.inputs.dataset.feature_columns_numeric
-        else:
-            features_list = self.params["features_columns"]
-
-        # Check if all elements from features_list are present in the feature columns
-        all_present = all(
-            elem in self.inputs.dataset.feature_columns for elem in features_list
+    if feature_columns and not all(elem in dataset.columns for elem in feature_columns):
+        raise ValueError(
+            "The list of feature columns provided do not match with training dataset feature columns"
        )
-        if not all_present:
-            raise ValueError(
-                "The list of feature columns provided do not match with "
-                + "training dataset feature columns"
-            )
-
-        dataset = self.inputs.dataset.df[features_list]
-
-        # Training with isolation forest algorithm
-        clf = IsolationForest(
-            random_state=self.params["random_state"],
-            contamination=self.params["contamination"],
+
+    feature_columns = feature_columns or dataset.feature_columns_numeric
+
+    df = dataset.df[feature_columns]
+
+    clf = IsolationForest(
+        random_state=random_state,
+        contamination=contamination,
+    )
+    clf.fit(df)
+    y_pred = clf.predict(df)
+
+    figures = []
+
+    for feature1, feature2 in itertools.combinations(feature_columns, 2):
+        fig = plt.figure()
+        ax = sns.scatterplot(
+            data=df, x=feature1, y=feature2, hue=y_pred, palette="bright"
        )
-        clf.fit(dataset)
-        y_pred = clf.predict(dataset)
-
-        test_figures = []
-        combination_pairs = list(itertools.combinations(features_list, 2))
-        for feature1, feature2 in combination_pairs:
-            fig = plt.figure()
-            ax = sns.scatterplot(
-                data=dataset, x=feature1, y=feature2, hue=y_pred, palette="bright"
-            )
-            handles, labels = ax.get_legend_handles_labels()
-            labels = list(map(lambda x: x.replace("-1", "Outliers"), labels))
-            labels = list(map(lambda x: x.replace("1", "Inliers"), labels))
-            ax.legend(handles, labels)
-            # Do this if you want to prevent the figure from being displayed
-            plt.close("all")
-
-            test_figures.append(
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}:{feature1}_{feature2}",
-                    figure=fig,
-                )
-            )
-
-        return self.cache_results(figures=test_figures)
+        handles, labels = ax.get_legend_handles_labels()
+        labels = list(map(lambda x: x.replace("-1", "Outliers"), labels))
+        labels = list(map(lambda x: x.replace("1", "Inliers"), labels))
+        ax.legend(handles, labels)
+
+        figures.append(fig)
+
+    plt.close()
+
+    return tuple(figures)
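
The legend relabeling above exists because sklearn's IsolationForest.predict returns -1 for outliers and 1 for inliers. A minimal standalone sketch of that contract (synthetic data, illustrative only):

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
# 95 inlier points around the origin plus 5 far-away points
X = np.vstack([rng.normal(0, 1, size=(95, 2)), rng.uniform(6, 8, size=(5, 2))])

clf = IsolationForest(random_state=0, contamination=0.1)
clf.fit(X)
y_pred = clf.predict(X)  # -1 = outlier, 1 = inlier, hence the relabeling above

print(int((y_pred == -1).sum()), "points flagged as outliers")  # ~10 with contamination=0.1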
validmind/tests/data_validation/KPSS.py

@@ -2,19 +2,20 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import pandas as pd
 from statsmodels.tsa.stattools import kpss
 
+from validmind import tags, tasks
+from validmind.errors import SkipTestError
 from validmind.logging import get_logger
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind.vm_models import VMDataset
 
 logger = get_logger(__name__)
 
 
-@dataclass
-class KPSS(Metric):
+@tags("time_series_data", "stationarity", "unit_root_test", "statsmodels")
+@tasks("data_validation")
+def KPSS(dataset: VMDataset):
     """
     Assesses the stationarity of time-series data in a machine learning model using the KPSS unit root test.
 
@@ -53,81 +54,32 @@ class KPSS(Metric):
     - The reliability of the test is contingent on the number of lags selected, which introduces potential bias in the
     measurement.
     """
+    df = dataset.df.dropna()
 
-    name = "kpss"
-    required_inputs = ["dataset"]
-    tasks = ["regression"]
-    tags = [
-        "time_series_data",
-        "forecasting",
-        "stationarity",
-        "unit_root_test",
-        "statsmodels",
-    ]
-
-    def run(self):
-        """
-        Calculates KPSS for each of the dataset features
-        """
-        dataset = self.inputs.dataset.df
-
-        # Check if the dataset is a time series
-        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
-            raise ValueError(
-                "Dataset index must be a datetime or period index for time series analysis."
-            )
-
-        # Preprocessing: Drop rows with any NaN values
-        if dataset.isnull().values.any():
-            logger.warning(
-                "Dataset contains missing values. Rows with NaNs will be dropped."
-            )
-            dataset = dataset.dropna()
-
-        # Convert to numeric and handle non-numeric data
-        dataset = dataset.apply(pd.to_numeric, errors="coerce")
-
-        # Initialize a list to store KPSS results
-        kpss_values = []
-
-        for col in dataset.columns:
-            try:
-                kpss_stat, pvalue, usedlag, critical_values = kpss(dataset[col].values)
-                kpss_values.append(
-                    {
-                        "Variable": col,
-                        "stat": kpss_stat,
-                        "pvalue": pvalue,
-                        "usedlag": usedlag,
-                        "critical_values": critical_values,
-                    }
-                )
-            except Exception as e:
-                logger.error(f"Error processing column '{col}': {e}")
-                kpss_values.append(
-                    {
-                        "Variable": col,
-                        "stat": None,
-                        "pvalue": None,
-                        "usedlag": None,
-                        "critical_values": None,
-                        "error": str(e),
-                    }
-                )
-
-        return self.cache_results({"kpss_results": kpss_values})
-
-    def summary(self, metric_value):
-        """
-        Build a table for summarizing the KPSS results
-        """
-        kpss_results = metric_value["kpss_results"]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=kpss_results,
-                    metadata=ResultTableMetadata(title="KPSS Test Results"),
-                )
-            ]
+    if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+        raise SkipTestError(
+            "Dataset index must be a datetime or period index for time series analysis."
        )
+
+    df = df.apply(pd.to_numeric, errors="coerce")
+
+    kpss_table = []
+
+    for col in dataset.columns:
+        kpss_stat, pvalue, usedlag, critical_values = kpss(df[col].values)
+        kpss_table.append(
+            {
+                "Variable": col,
+                "stat": kpss_stat,
+                "pvalue": pvalue,
+                "usedlag": usedlag,
+                "critical_values": critical_values,
+            }
+        )
+
+    if not kpss_table:
+        raise SkipTestError(f"No KPSS results found for dataset: {dataset.input_id}")
+
+    return {
+        "KPSS Test Results": kpss_table,
+    }
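
The rewritten loop unpacks the four-tuple that statsmodels' kpss returns: (statistic, p-value, lags used, critical values). A minimal sketch of that call on a synthetic date-indexed series (illustrative data; statsmodels may emit an InterpolationWarning when the p-value falls outside its lookup table):

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import kpss

idx = pd.date_range("2020-01-01", periods=200, freq="D")
series = pd.Series(np.random.RandomState(42).normal(size=200), index=idx)

stat, pvalue, usedlag, crit = kpss(series.values)  # same four fields tabulated per column above
print(f"stat={stat:.3f} pvalue={pvalue:.3f} usedlag={usedlag} critical_values={crit}")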
validmind/tests/data_validation/LaggedCorrelationHeatmap.py

@@ -6,13 +6,16 @@ import numpy as np
 import pandas as pd
 import plotly.figure_factory as ff
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
 # Define the 'coolwarm' color scale manually
 COOLWARM = [[0, "rgb(95,5,255)"], [0.5, "rgb(255,255,255)"], [1, "rgb(255,5,0)"]]
 
 
-class LaggedCorrelationHeatmap(Metric):
+@tags("time_series_data", "visualization")
+@tasks("regression")
+def LaggedCorrelationHeatmap(dataset: VMDataset, num_lags: int = 10):
     """
     Assesses and visualizes correlation between target variable and lagged independent variables in a time-series
     dataset.
@@ -55,98 +58,47 @@ class LaggedCorrelationHeatmap(Metric):
     to interpret, while too few might overlook delayed effects.
     - This metric does not take into account any causal relationships, but merely demonstrates correlation.
     """
-
-    name = "lagged_correlation_heatmap"
-    required_inputs = ["dataset"]
-    tasks = ["regression"]
-    tags = ["time_series_data", "visualization"]
-
-    def _compute_correlations(self, df, target_col, independent_vars, num_lags):
-        correlations = np.zeros((len(independent_vars), num_lags + 1))
-
-        for i, ind_var_col in enumerate(independent_vars):
-            for lag in range(num_lags + 1):
-                temp_df = pd.DataFrame(
-                    {
-                        target_col: df[target_col],
-                        f"{ind_var_col}_lag{lag}": df[ind_var_col].shift(lag),
-                    }
-                )
-
-                temp_df = temp_df.dropna()
-
-                corr = temp_df[target_col].corr(temp_df[f"{ind_var_col}_lag{lag}"])
-
-                correlations[i, lag] = corr
-
-        return correlations
-
-    def _plot_heatmap(self, correlations, independent_vars, target_col, num_lags):
-        correlation_df = pd.DataFrame(
-            correlations,
-            columns=[f"{i}" for i in range(num_lags + 1)],
-            index=independent_vars,
-        )
-
-        # Create heatmap using Plotly
-        fig = ff.create_annotated_heatmap(
-            z=correlation_df.values,
-            x=list(correlation_df.columns),
-            y=list(correlation_df.index),
-            colorscale=COOLWARM,
-            annotation_text=correlation_df.round(2).values,
-            showscale=True,
-        )
-
-        fig.update_layout(
-            title={
-                "text": f"Correlations between {target_col} and Lags of Features",
-                "y": 0.95,
-                "x": 0.5,
-                "xanchor": "center",
-                "yanchor": "top",
-            },
-            font=dict(size=14),
-            xaxis_title="Lags",
-        )
-
-        return fig
-
-    def run(self):
-        if isinstance(self.inputs.dataset.target_column, list):
-            target_col = self.inputs.dataset.target_column[
-                0
-            ]  # take the first item from the list
-        else:
-            target_col = self.inputs.dataset.target_column
-
-        independent_vars = list(self.inputs.dataset.feature_columns)
-        num_lags = self.params.get("num_lags", 10)
-
-        if isinstance(target_col, list) and len(target_col) == 1:
-            target_col = target_col[0]
-
-        if not isinstance(target_col, str):
-            raise ValueError(
-                "The 'target_col' must be a single string or a list containing a single string"
-            )
-
-        df = self.inputs.dataset.df
-
-        correlations = self._compute_correlations(
-            df, target_col, independent_vars, num_lags
-        )
-        fig = self._plot_heatmap(correlations, independent_vars, target_col, num_lags)
-
-        figures = []
-        figures.append(
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
+    correlations = np.zeros((len(dataset.feature_columns), num_lags + 1))
+
+    for i, ind_var_col in enumerate(dataset.feature_columns):
+        for lag in range(num_lags + 1):
+            temp_df = pd.DataFrame(
+                {
+                    dataset.target_column: dataset.df[dataset.target_column],
+                    f"{ind_var_col}_lag{lag}": dataset.df[ind_var_col].shift(lag),
+                }
+            ).dropna()
+
+            corr = temp_df[dataset.target_column].corr(
+                temp_df[f"{ind_var_col}_lag{lag}"]
            )
-        )
 
-        return self.cache_results(
-            figures=figures,
-        )
+            correlations[i, lag] = corr
+
+    correlation_df = pd.DataFrame(
+        correlations,
+        columns=[f"{i}" for i in range(num_lags + 1)],
+        index=dataset.feature_columns,
+    )
+
+    fig = ff.create_annotated_heatmap(
+        z=correlation_df.values,
+        x=list(correlation_df.columns),
+        y=list(correlation_df.index),
+        colorscale=COOLWARM,
+        annotation_text=correlation_df.round(2).values,
+        showscale=True,
+    )
+    fig.update_layout(
+        title={
+            "text": f"Correlations between {dataset.target_column} and Lags of Features",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        font=dict(size=14),
+        xaxis_title="Lags",
    )
+
+    return fig
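
The heatmap's cell values come down to the pandas shift-then-correlate pattern visible in the new function body. A minimal sketch with a synthetic pair of series whose target trails a driver by three steps (names and data are illustrative):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
driver = pd.Series(rng.normal(size=300))
target = driver.shift(3) + rng.normal(scale=0.1, size=300)  # target lags driver by 3 steps

# Same per-lag pattern the test applies to each feature column.
for lag in range(6):
    pair = pd.DataFrame({"y": target, "x_lag": driver.shift(lag)}).dropna()
    print(lag, round(pair["y"].corr(pair["x_lag"]), 2))  # correlation peaks at lag=3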
validmind/tests/data_validation/MissingValues.py

@@ -2,20 +2,13 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
-
-@dataclass
-class MissingValues(ThresholdTest):
+@tags("tabular_data", "data_quality")
+@tasks("classification", "regression")
+def MissingValues(dataset: VMDataset, min_threshold: int = 1):
     """
     Evaluates dataset quality by ensuring missing value ratio across all features does not exceed a set threshold.
 
@@ -53,49 +46,15 @@ class MissingValues(ThresholdTest):
     - Does not account for data encoded as values like "-999" or "None," which might not technically classify as
     missing but could bear similar implications.
     """
-
-    name = "missing"
-    required_inputs = ["dataset"]
-    default_params = {"min_threshold": 1}
-    tasks = ["classification", "regression"]
-    tags = ["tabular_data", "data_quality"]
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The missing values test returns results like these:
-        [{"values": {"n_missing": 0, "p_missing": 0.0}, "column": "Exited", "passed": true}]
-        """
-        results_table = [
-            {
-                "Column": result.column,
-                "Number of Missing Values": result.values["n_missing"],
-                "Percentage of Missing Values (%)": result.values["p_missing"] * 100,
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-            for result in results
-        ]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(
-                        title="Missing Values Results for Dataset"
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        rows = self.inputs.dataset.df.shape[0]
-
-        missing = self.inputs.dataset.df.isna().sum()
-        results = [
-            ThresholdTestResult(
-                column=col,
-                passed=missing[col] < self.params["min_threshold"],
-                values={"n_missing": missing[col], "p_missing": missing[col] / rows},
-            )
-            for col in missing.index
-        ]
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
+    df = dataset.df
+    missing = df.isna().sum()
+
+    return [
+        {
+            "Column": col,
+            "Number of Missing Values": missing[col],
+            "Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
+            "Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
+        }
+        for col in missing.index
+    ], all(missing[col] < min_threshold for col in missing.index)
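
Where the old ThresholdTest cached ThresholdTestResult objects, the refactored function simply returns a (table, passed) tuple. The same logic can be traced on a toy frame (illustrative data only):

import numpy as np
import pandas as pd

df = pd.DataFrame({"age": [34, 29, np.nan, 41], "income": [50000, 62000, 58000, 47000]})
min_threshold = 1

missing = df.isna().sum()
table = [
    {
        "Column": col,
        "Number of Missing Values": missing[col],
        "Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
        "Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
    }
    for col in missing.index
]
passed = all(missing[col] < min_threshold for col in missing.index)
print(pd.DataFrame(table))
print("overall passed:", passed)  # False: "age" has one missing value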