validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
@@ -2,15 +2,17 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from dataclasses import dataclass
6
-
7
5
  import plotly.graph_objects as go
8
6
 
9
- from validmind.vm_models import Figure, Metric
7
+ from validmind import tags, tasks
8
+ from validmind.vm_models import VMDataset
10
9
 
11
10
 
12
- @dataclass
13
- class MissingValuesBarPlot(Metric):
11
+ @tags("tabular_data", "data_quality", "visualization")
12
+ @tasks("classification", "regression")
13
+ def MissingValuesBarPlot(
14
+ dataset: VMDataset, threshold: int = 80, fig_height: int = 600
15
+ ):
14
16
  """
15
17
  Assesses the percentage and distribution of missing values in the dataset via a bar plot, with emphasis on
16
18
  identifying high-risk columns based on a user-defined threshold.
@@ -55,90 +57,62 @@ class MissingValuesBarPlot(Metric):
55
57
  - The metric does not consider possible impacts of the missing data on the model's accuracy or precision.
56
58
  - Interpretation of the findings and the next steps might require an expert understanding of the field.
57
59
  """
58
-
59
- name = "missing_values_bar_plot"
60
- required_inputs = ["dataset"]
61
- default_params = {"threshold": 80, "fig_height": 600}
62
- tasks = ["classification", "regression"]
63
- tags = ["tabular_data", "data_quality", "visualization"]
64
-
65
- def run(self):
66
- threshold = self.params["threshold"]
67
- fig_height = self.params["fig_height"]
68
-
69
- figure = self.visualize_missing_values(threshold, fig_height)
70
-
71
- return self.cache_results(figures=figure)
72
-
73
- def visualize_missing_values(self, threshold, fig_height):
74
- # Calculate the percentage of missing values in each column
75
- missing_percentages = (
76
- self.inputs.dataset.df.isnull().sum() / len(self.inputs.dataset.df)
77
- ) * 100
78
-
79
- # Only keep entries where missing_percentage > 0
80
- missing_percentages = missing_percentages[missing_percentages > 0]
81
-
82
- # Sort missing value percentages in ascending order
83
- missing_percentages_sorted = missing_percentages.sort_values(ascending=True)
84
-
85
- # Create lists to store the x and y values for each bar
86
- y_below_threshold = []
87
- x_below_threshold = []
88
- y_above_threshold = []
89
- x_above_threshold = []
90
-
91
- # Iterate through the missing percentages and separate values based on the threshold
92
- for index, value in missing_percentages_sorted.items():
93
- if value < threshold:
94
- y_below_threshold.append(index)
95
- x_below_threshold.append(value)
96
- else:
97
- y_above_threshold.append(index)
98
- x_above_threshold.append(value)
99
-
100
- # Create bar traces for values below and above threshold
101
- trace_below_threshold = go.Bar(
102
- y=y_below_threshold,
103
- x=x_below_threshold,
104
- marker_color="grey",
105
- name="Below Threshold",
106
- orientation="h",
107
- hovertemplate="Column: %{y}<br>Missing Value Percentage: %{x:.2f}%",
108
- )
109
-
110
- trace_above_threshold = go.Bar(
111
- y=y_above_threshold,
112
- x=x_above_threshold,
113
- marker_color="lightcoral",
114
- name="Above Threshold",
115
- orientation="h",
116
- hovertemplate="Column: %{y}<br>Missing Value Percentage: %{x:.2f}%",
117
- )
118
-
119
- # Draw a red line at the specified threshold
120
- threshold_line = go.Scatter(
121
- y=missing_percentages_sorted.index,
122
- x=[threshold] * len(missing_percentages_sorted.index),
123
- mode="lines",
124
- name="Threshold: {}%".format(threshold),
125
- line=dict(color="red", dash="dash"),
126
- )
127
-
128
- # Create a layout
129
- layout = go.Layout(
60
+ # Calculate the percentage of missing values in each column
61
+ missing_percentages = (dataset.df.isnull().sum() / len(dataset.df)) * 100
62
+ # Only keep entries where missing_percentage > 0
63
+ missing_percentages = missing_percentages[missing_percentages > 0]
64
+ # Sort missing value percentages in ascending order
65
+ missing_percentages_sorted = missing_percentages.sort_values(ascending=True)
66
+
67
+ # Create lists to store the x and y values for each bar
68
+ y_below_threshold = []
69
+ x_below_threshold = []
70
+ y_above_threshold = []
71
+ x_above_threshold = []
72
+
73
+ # Iterate through the missing percentages and separate values based on the threshold
74
+ for index, value in missing_percentages_sorted.items():
75
+ if value < threshold:
76
+ y_below_threshold.append(index)
77
+ x_below_threshold.append(value)
78
+ else:
79
+ y_above_threshold.append(index)
80
+ x_above_threshold.append(value)
81
+
82
+ # Create bar traces for values below and above threshold
83
+ trace_below_threshold = go.Bar(
84
+ y=y_below_threshold,
85
+ x=x_below_threshold,
86
+ marker_color="grey",
87
+ name="Below Threshold",
88
+ orientation="h",
89
+ hovertemplate="Column: %{y}<br>Missing Value Percentage: %{x:.2f}%",
90
+ )
91
+ trace_above_threshold = go.Bar(
92
+ y=y_above_threshold,
93
+ x=x_above_threshold,
94
+ marker_color="lightcoral",
95
+ name="Above Threshold",
96
+ orientation="h",
97
+ hovertemplate="Column: %{y}<br>Missing Value Percentage: %{x:.2f}%",
98
+ )
99
+
100
+ # Draw a red line at the specified threshold
101
+ threshold_line = go.Scatter(
102
+ y=missing_percentages_sorted.index,
103
+ x=[threshold] * len(missing_percentages_sorted.index),
104
+ mode="lines",
105
+ name="Threshold: {}%".format(threshold),
106
+ line=dict(color="red", dash="dash"),
107
+ )
108
+
109
+ return go.Figure(
110
+ data=[trace_below_threshold, trace_above_threshold, threshold_line],
111
+ layout=go.Layout(
130
112
  title="Missing Values",
131
113
  yaxis=dict(title="Columns"),
132
114
  xaxis=dict(title="Missing Value Percentage (%)", range=[0, 100]),
133
115
  barmode="stack",
134
116
  height=fig_height,
135
- )
136
-
137
- # Create a Figure object
138
- fig = go.Figure(
139
- data=[trace_below_threshold, trace_above_threshold, threshold_line],
140
- layout=layout,
141
- )
142
-
143
- figure = Figure(for_object=self, key=self.key, figure=fig)
144
- return [figure]
117
+ ),
118
+ )
@@ -2,20 +2,22 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from dataclasses import dataclass
6
-
5
+ import numpy as np
7
6
  import pandas as pd
8
7
  from arch.unitroot import PhillipsPerron
9
8
  from numpy.linalg import LinAlgError
10
9
 
10
+ from validmind import tags, tasks
11
+ from validmind.errors import SkipTestError
11
12
  from validmind.logging import get_logger
12
- from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
13
+ from validmind.vm_models import VMDataset
13
14
 
14
15
  logger = get_logger(__name__)
15
16
 
16
17
 
17
- @dataclass
18
- class PhillipsPerronArch(Metric):
18
+ @tags("time_series_data", "forecasting", "statistical_test", "unit_root_test")
19
+ @tasks("regression")
20
+ def PhillipsPerronArch(dataset: VMDataset):
19
21
  """
20
22
  Assesses the stationarity of time series data in each feature of the ML model using the Phillips-Perron test.
21
23
 
@@ -55,80 +57,55 @@ class PhillipsPerronArch(Metric):
55
57
  - Non-stationary time series must be converted to stationary series through differencing, potentially leading to
56
58
  loss of important data points.
57
59
  """
60
+ df = dataset.df.dropna()
58
61
 
59
- name = "phillips_perron"
60
- required_inputs = ["dataset"]
61
- tasks = ["regression"]
62
- tags = [
63
- "time_series_data",
64
- "forecasting",
65
- "statistical_test",
66
- "unit_root_test",
67
- ]
68
-
69
- def run(self):
70
- """
71
- Calculates PP metric for each of the dataset features
72
- """
73
- dataset = self.inputs.dataset.df
74
-
75
- # Check if the dataset is a time series
76
- if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
77
- raise ValueError(
78
- "Dataset index must be a datetime or period index for time series analysis."
79
- )
80
-
81
- # Preprocessing: Drop rows with any NaN values
82
- if dataset.isnull().values.any():
83
- logger.warning(
84
- "Dataset contains missing values. Rows with NaNs will be dropped."
85
- )
86
- dataset = dataset.dropna()
87
-
88
- # Convert to numeric and handle non-numeric data
89
- dataset = dataset.apply(pd.to_numeric, errors="coerce")
90
-
91
- # Initialize a list to store Phillips-Perron results
92
- pp_values = []
93
-
94
- for col in dataset.columns:
95
- try:
96
- pp = PhillipsPerron(dataset[col].values)
97
- pp_values.append(
98
- {
99
- "Variable": col,
100
- "stat": pp.stat,
101
- "pvalue": pp.pvalue,
102
- "usedlag": pp.lags,
103
- "nobs": pp.nobs,
104
- }
105
- )
106
- except LinAlgError as e:
107
- logger.error(f"Error processing column '{col}': {e}")
108
- pp_values.append(
109
- {
110
- "Variable": col,
111
- "stat": None,
112
- "pvalue": None,
113
- "usedlag": None,
114
- "nobs": None,
115
- "error": str(e),
116
- }
117
- )
118
-
119
- return self.cache_results({"phillips_perron_results": pp_values})
120
-
121
- def summary(self, metric_value):
122
- """
123
- Build a table for summarizing the Phillips-Perron results
124
- """
125
- pp_results = metric_value["phillips_perron_results"]
62
+ if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
63
+ raise ValueError(
64
+ "Dataset index must be a datetime or period index for time series analysis."
65
+ )
126
66
 
127
- return ResultSummary(
128
- results=[
129
- ResultTable(
130
- data=pp_results,
131
- metadata=ResultTableMetadata(title="Phillips-Perron Test Results"),
67
+ # Filter numeric columns first
68
+ numeric_columns = df.select_dtypes(include=np.number).columns
69
+ if not any(col in numeric_columns for col in dataset.feature_columns):
70
+ raise SkipTestError("No numeric columns found for Phillips-Perron test.")
71
+
72
+ pp_table = []
73
+
74
+ for col in dataset.feature_columns:
75
+ # Skip non-numeric columns
76
+ if col not in numeric_columns:
77
+ logger.warning(f"Skipping non-numeric column: {col}")
78
+ continue
79
+
80
+ try:
81
+ # Drop any NaN values for this column
82
+ series = df[col].dropna()
83
+ if len(series) == 0:
84
+ logger.warning(
85
+ f"Skipping column '{col}': No valid data after dropping NaN values"
132
86
  )
133
- ]
134
- )
87
+ continue
88
+
89
+ pp = PhillipsPerron(series.values)
90
+ pp_table.append(
91
+ {
92
+ "Variable": col,
93
+ "stat": pp.stat,
94
+ "pvalue": pp.pvalue,
95
+ "usedlag": pp.lags,
96
+ "nobs": pp.nobs,
97
+ }
98
+ )
99
+ except LinAlgError as e:
100
+ logger.error(f"Error processing column '{col}': {e}")
101
+ continue
102
+ except Exception as e:
103
+ logger.error(f"Unexpected error processing column '{col}': {e}")
104
+ continue
105
+
106
+ if not pp_table:
107
+ raise SkipTestError("No valid columns found for Phillips-Perron test.")
108
+
109
+ return {
110
+ "Phillips-Perron Test Results": pp_table,
111
+ }
@@ -5,10 +5,44 @@
5
5
  import matplotlib.pyplot as plt
6
6
  import pandas as pd
7
7
 
8
- from validmind.vm_models import Figure, Metric
9
-
10
-
11
- class RollingStatsPlot(Metric):
8
+ from validmind import tags, tasks
9
+ from validmind.errors import SkipTestError
10
+ from validmind.vm_models import VMDataset
11
+
12
+
13
+ def plot_rolling_statistics(df, col, window_size):
14
+ rolling_mean = df[col].rolling(window=window_size).mean()
15
+ rolling_std = df[col].rolling(window=window_size).std()
16
+
17
+ fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
18
+
19
+ ax1.plot(rolling_mean)
20
+ ax1.set_title(
21
+ f"Rolling Mean for {col}",
22
+ fontsize=20,
23
+ weight="bold",
24
+ )
25
+ ax1.set_ylabel("")
26
+ ax1.tick_params(axis="both", labelsize=18)
27
+ ax1.legend()
28
+
29
+ ax2.plot(rolling_std, label="Rolling Standard Deviation", color="orange")
30
+ ax2.set_title(
31
+ f"Rolling STD for {col}",
32
+ fontsize=20,
33
+ weight="bold",
34
+ )
35
+ ax2.set_xlabel("")
36
+ ax2.set_ylabel("")
37
+ ax2.tick_params(axis="both", labelsize=18)
38
+ ax2.legend()
39
+
40
+ return fig
41
+
42
+
43
+ @tags("time_series_data", "visualization", "stationarity")
44
+ @tasks("regression")
45
+ def RollingStatsPlot(dataset: VMDataset, window_size: int = 12):
12
46
  """
13
47
  Evaluates the stationarity of time series data by plotting its rolling mean and standard deviation over a specified
14
48
  window.
@@ -58,81 +92,16 @@ class RollingStatsPlot(Metric):
58
92
  such as through statistical tests. Therefore, the interpretation is subjective and depends heavily on modeler
59
93
  discretion.
60
94
  """
61
-
62
- name = "rolling_stats_plot"
63
- required_inputs = ["dataset"]
64
- default_params = {"window_size": 12}
65
- tasks = ["regression"]
66
- tags = ["time_series_data", "visualization", "stationarity"]
67
-
68
- def plot_rolling_statistics(self, col, window_size=12):
69
- """
70
- Plot rolling mean and rolling standard deviation in different subplots for a given series.
71
- :param series: Pandas Series with time-series data
72
- :param window_size: Window size for the rolling calculations
73
- :param ax1: Axis object for the rolling mean plot
74
- :param ax2: Axis object for the rolling standard deviation plot
75
- """
76
- rolling_mean = (
77
- self.inputs.dataset.df[col].rolling(window=int(window_size)).mean()
78
- )
79
- rolling_std = self.inputs.dataset.df[col].rolling(window=int(window_size)).std()
80
-
81
- # Create a new figure and axis objects
82
- fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
83
-
84
- ax1.plot(rolling_mean)
85
-
86
- ax1.set_title(
87
- f"Rolling Mean for {col}",
88
- fontsize=20,
89
- weight="bold",
90
- )
91
- ax1.set_ylabel("")
92
- ax1.tick_params(axis="both", labelsize=18)
93
- ax1.legend()
94
-
95
- ax2.plot(rolling_std, label="Rolling Standard Deviation", color="orange")
96
- ax2.set_title(
97
- f"Rolling STD for {col}",
98
- fontsize=20,
99
- weight="bold",
100
- )
101
- ax2.set_xlabel("")
102
- ax2.set_ylabel("")
103
- ax2.tick_params(axis="both", labelsize=18)
104
- ax2.legend()
105
-
106
- return fig
107
-
108
- def run(self):
109
- if "window_size" not in self.params:
110
- raise ValueError("Window size must be provided in params")
111
-
112
- # Check if index is datetime
113
- if not pd.api.types.is_datetime64_any_dtype(self.inputs.dataset.df.index):
114
- raise ValueError("Index must be a datetime type")
115
-
116
- window_size = self.params["window_size"]
117
- df = self.inputs.dataset.df.dropna()
118
-
119
- if not set(df.columns).issubset(set(df.columns)):
120
- raise ValueError("Provided 'columns' must exist in the dataset")
121
-
122
- figures = []
123
-
124
- for col in df.columns:
125
- fig = self.plot_rolling_statistics(col, window_size=window_size)
126
-
127
- figures.append(
128
- Figure(
129
- for_object=self,
130
- key=f"{self.key}:{col}",
131
- figure=fig,
132
- )
95
+ if not pd.api.types.is_datetime64_any_dtype(dataset.df.index):
96
+ raise SkipTestError("Index must be a datetime type")
97
+
98
+ return tuple(
99
+ [
100
+ plot_rolling_statistics(
101
+ df=dataset.df.dropna(),
102
+ col=col,
103
+ window_size=window_size,
133
104
  )
134
-
135
- # Do this if you want to prevent the figure from being displayed
136
- plt.close("all")
137
-
138
- return self.cache_results(figures=figures)
105
+ for col in dataset.feature_columns
106
+ ]
107
+ )