validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
@@ -2,15 +2,17 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from dataclasses import dataclass
|
6
|
-
|
7
5
|
import plotly.graph_objects as go
|
8
6
|
|
9
|
-
from validmind
|
7
|
+
from validmind import tags, tasks
|
8
|
+
from validmind.vm_models import VMDataset
|
10
9
|
|
11
10
|
|
12
|
-
@
|
13
|
-
|
11
|
+
@tags("tabular_data", "data_quality", "visualization")
|
12
|
+
@tasks("classification", "regression")
|
13
|
+
def MissingValuesBarPlot(
|
14
|
+
dataset: VMDataset, threshold: int = 80, fig_height: int = 600
|
15
|
+
):
|
14
16
|
"""
|
15
17
|
Assesses the percentage and distribution of missing values in the dataset via a bar plot, with emphasis on
|
16
18
|
identifying high-risk columns based on a user-defined threshold.
|
@@ -55,90 +57,62 @@ class MissingValuesBarPlot(Metric):
|
|
55
57
|
- The metric does not consider possible impacts of the missing data on the model's accuracy or precision.
|
56
58
|
- Interpretation of the findings and the next steps might require an expert understanding of the field.
|
57
59
|
"""
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
trace_above_threshold = go.Bar(
|
111
|
-
y=y_above_threshold,
|
112
|
-
x=x_above_threshold,
|
113
|
-
marker_color="lightcoral",
|
114
|
-
name="Above Threshold",
|
115
|
-
orientation="h",
|
116
|
-
hovertemplate="Column: %{y}<br>Missing Value Percentage: %{x:.2f}%",
|
117
|
-
)
|
118
|
-
|
119
|
-
# Draw a red line at the specified threshold
|
120
|
-
threshold_line = go.Scatter(
|
121
|
-
y=missing_percentages_sorted.index,
|
122
|
-
x=[threshold] * len(missing_percentages_sorted.index),
|
123
|
-
mode="lines",
|
124
|
-
name="Threshold: {}%".format(threshold),
|
125
|
-
line=dict(color="red", dash="dash"),
|
126
|
-
)
|
127
|
-
|
128
|
-
# Create a layout
|
129
|
-
layout = go.Layout(
|
60
|
+
# Calculate the percentage of missing values in each column
|
61
|
+
missing_percentages = (dataset.df.isnull().sum() / len(dataset.df)) * 100
|
62
|
+
# Only keep entries where missing_percentage > 0
|
63
|
+
missing_percentages = missing_percentages[missing_percentages > 0]
|
64
|
+
# Sort missing value percentages in ascending order
|
65
|
+
missing_percentages_sorted = missing_percentages.sort_values(ascending=True)
|
66
|
+
|
67
|
+
# Create lists to store the x and y values for each bar
|
68
|
+
y_below_threshold = []
|
69
|
+
x_below_threshold = []
|
70
|
+
y_above_threshold = []
|
71
|
+
x_above_threshold = []
|
72
|
+
|
73
|
+
# Iterate through the missing percentages and separate values based on the threshold
|
74
|
+
for index, value in missing_percentages_sorted.items():
|
75
|
+
if value < threshold:
|
76
|
+
y_below_threshold.append(index)
|
77
|
+
x_below_threshold.append(value)
|
78
|
+
else:
|
79
|
+
y_above_threshold.append(index)
|
80
|
+
x_above_threshold.append(value)
|
81
|
+
|
82
|
+
# Create bar traces for values below and above threshold
|
83
|
+
trace_below_threshold = go.Bar(
|
84
|
+
y=y_below_threshold,
|
85
|
+
x=x_below_threshold,
|
86
|
+
marker_color="grey",
|
87
|
+
name="Below Threshold",
|
88
|
+
orientation="h",
|
89
|
+
hovertemplate="Column: %{y}<br>Missing Value Percentage: %{x:.2f}%",
|
90
|
+
)
|
91
|
+
trace_above_threshold = go.Bar(
|
92
|
+
y=y_above_threshold,
|
93
|
+
x=x_above_threshold,
|
94
|
+
marker_color="lightcoral",
|
95
|
+
name="Above Threshold",
|
96
|
+
orientation="h",
|
97
|
+
hovertemplate="Column: %{y}<br>Missing Value Percentage: %{x:.2f}%",
|
98
|
+
)
|
99
|
+
|
100
|
+
# Draw a red line at the specified threshold
|
101
|
+
threshold_line = go.Scatter(
|
102
|
+
y=missing_percentages_sorted.index,
|
103
|
+
x=[threshold] * len(missing_percentages_sorted.index),
|
104
|
+
mode="lines",
|
105
|
+
name="Threshold: {}%".format(threshold),
|
106
|
+
line=dict(color="red", dash="dash"),
|
107
|
+
)
|
108
|
+
|
109
|
+
return go.Figure(
|
110
|
+
data=[trace_below_threshold, trace_above_threshold, threshold_line],
|
111
|
+
layout=go.Layout(
|
130
112
|
title="Missing Values",
|
131
113
|
yaxis=dict(title="Columns"),
|
132
114
|
xaxis=dict(title="Missing Value Percentage (%)", range=[0, 100]),
|
133
115
|
barmode="stack",
|
134
116
|
height=fig_height,
|
135
|
-
)
|
136
|
-
|
137
|
-
# Create a Figure object
|
138
|
-
fig = go.Figure(
|
139
|
-
data=[trace_below_threshold, trace_above_threshold, threshold_line],
|
140
|
-
layout=layout,
|
141
|
-
)
|
142
|
-
|
143
|
-
figure = Figure(for_object=self, key=self.key, figure=fig)
|
144
|
-
return [figure]
|
117
|
+
),
|
118
|
+
)
|
@@ -2,20 +2,22 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
import numpy as np
|
7
6
|
import pandas as pd
|
8
7
|
from arch.unitroot import PhillipsPerron
|
9
8
|
from numpy.linalg import LinAlgError
|
10
9
|
|
10
|
+
from validmind import tags, tasks
|
11
|
+
from validmind.errors import SkipTestError
|
11
12
|
from validmind.logging import get_logger
|
12
|
-
from validmind.vm_models import
|
13
|
+
from validmind.vm_models import VMDataset
|
13
14
|
|
14
15
|
logger = get_logger(__name__)
|
15
16
|
|
16
17
|
|
17
|
-
@
|
18
|
-
|
18
|
+
@tags("time_series_data", "forecasting", "statistical_test", "unit_root_test")
|
19
|
+
@tasks("regression")
|
20
|
+
def PhillipsPerronArch(dataset: VMDataset):
|
19
21
|
"""
|
20
22
|
Assesses the stationarity of time series data in each feature of the ML model using the Phillips-Perron test.
|
21
23
|
|
@@ -55,80 +57,55 @@ class PhillipsPerronArch(Metric):
|
|
55
57
|
- Non-stationary time series must be converted to stationary series through differencing, potentially leading to
|
56
58
|
loss of important data points.
|
57
59
|
"""
|
60
|
+
df = dataset.df.dropna()
|
58
61
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
"time_series_data",
|
64
|
-
"forecasting",
|
65
|
-
"statistical_test",
|
66
|
-
"unit_root_test",
|
67
|
-
]
|
68
|
-
|
69
|
-
def run(self):
|
70
|
-
"""
|
71
|
-
Calculates PP metric for each of the dataset features
|
72
|
-
"""
|
73
|
-
dataset = self.inputs.dataset.df
|
74
|
-
|
75
|
-
# Check if the dataset is a time series
|
76
|
-
if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
|
77
|
-
raise ValueError(
|
78
|
-
"Dataset index must be a datetime or period index for time series analysis."
|
79
|
-
)
|
80
|
-
|
81
|
-
# Preprocessing: Drop rows with any NaN values
|
82
|
-
if dataset.isnull().values.any():
|
83
|
-
logger.warning(
|
84
|
-
"Dataset contains missing values. Rows with NaNs will be dropped."
|
85
|
-
)
|
86
|
-
dataset = dataset.dropna()
|
87
|
-
|
88
|
-
# Convert to numeric and handle non-numeric data
|
89
|
-
dataset = dataset.apply(pd.to_numeric, errors="coerce")
|
90
|
-
|
91
|
-
# Initialize a list to store Phillips-Perron results
|
92
|
-
pp_values = []
|
93
|
-
|
94
|
-
for col in dataset.columns:
|
95
|
-
try:
|
96
|
-
pp = PhillipsPerron(dataset[col].values)
|
97
|
-
pp_values.append(
|
98
|
-
{
|
99
|
-
"Variable": col,
|
100
|
-
"stat": pp.stat,
|
101
|
-
"pvalue": pp.pvalue,
|
102
|
-
"usedlag": pp.lags,
|
103
|
-
"nobs": pp.nobs,
|
104
|
-
}
|
105
|
-
)
|
106
|
-
except LinAlgError as e:
|
107
|
-
logger.error(f"Error processing column '{col}': {e}")
|
108
|
-
pp_values.append(
|
109
|
-
{
|
110
|
-
"Variable": col,
|
111
|
-
"stat": None,
|
112
|
-
"pvalue": None,
|
113
|
-
"usedlag": None,
|
114
|
-
"nobs": None,
|
115
|
-
"error": str(e),
|
116
|
-
}
|
117
|
-
)
|
118
|
-
|
119
|
-
return self.cache_results({"phillips_perron_results": pp_values})
|
120
|
-
|
121
|
-
def summary(self, metric_value):
|
122
|
-
"""
|
123
|
-
Build a table for summarizing the Phillips-Perron results
|
124
|
-
"""
|
125
|
-
pp_results = metric_value["phillips_perron_results"]
|
62
|
+
if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
|
63
|
+
raise ValueError(
|
64
|
+
"Dataset index must be a datetime or period index for time series analysis."
|
65
|
+
)
|
126
66
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
67
|
+
# Filter numeric columns first
|
68
|
+
numeric_columns = df.select_dtypes(include=np.number).columns
|
69
|
+
if not any(col in numeric_columns for col in dataset.feature_columns):
|
70
|
+
raise SkipTestError("No numeric columns found for Phillips-Perron test.")
|
71
|
+
|
72
|
+
pp_table = []
|
73
|
+
|
74
|
+
for col in dataset.feature_columns:
|
75
|
+
# Skip non-numeric columns
|
76
|
+
if col not in numeric_columns:
|
77
|
+
logger.warning(f"Skipping non-numeric column: {col}")
|
78
|
+
continue
|
79
|
+
|
80
|
+
try:
|
81
|
+
# Drop any NaN values for this column
|
82
|
+
series = df[col].dropna()
|
83
|
+
if len(series) == 0:
|
84
|
+
logger.warning(
|
85
|
+
f"Skipping column '{col}': No valid data after dropping NaN values"
|
132
86
|
)
|
133
|
-
|
134
|
-
|
87
|
+
continue
|
88
|
+
|
89
|
+
pp = PhillipsPerron(series.values)
|
90
|
+
pp_table.append(
|
91
|
+
{
|
92
|
+
"Variable": col,
|
93
|
+
"stat": pp.stat,
|
94
|
+
"pvalue": pp.pvalue,
|
95
|
+
"usedlag": pp.lags,
|
96
|
+
"nobs": pp.nobs,
|
97
|
+
}
|
98
|
+
)
|
99
|
+
except LinAlgError as e:
|
100
|
+
logger.error(f"Error processing column '{col}': {e}")
|
101
|
+
continue
|
102
|
+
except Exception as e:
|
103
|
+
logger.error(f"Unexpected error processing column '{col}': {e}")
|
104
|
+
continue
|
105
|
+
|
106
|
+
if not pp_table:
|
107
|
+
raise SkipTestError("No valid columns found for Phillips-Perron test.")
|
108
|
+
|
109
|
+
return {
|
110
|
+
"Phillips-Perron Test Results": pp_table,
|
111
|
+
}
|
@@ -5,10 +5,44 @@
|
|
5
5
|
import matplotlib.pyplot as plt
|
6
6
|
import pandas as pd
|
7
7
|
|
8
|
-
from validmind
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
from validmind import tags, tasks
|
9
|
+
from validmind.errors import SkipTestError
|
10
|
+
from validmind.vm_models import VMDataset
|
11
|
+
|
12
|
+
|
13
|
+
def plot_rolling_statistics(df, col, window_size):
|
14
|
+
rolling_mean = df[col].rolling(window=window_size).mean()
|
15
|
+
rolling_std = df[col].rolling(window=window_size).std()
|
16
|
+
|
17
|
+
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
|
18
|
+
|
19
|
+
ax1.plot(rolling_mean)
|
20
|
+
ax1.set_title(
|
21
|
+
f"Rolling Mean for {col}",
|
22
|
+
fontsize=20,
|
23
|
+
weight="bold",
|
24
|
+
)
|
25
|
+
ax1.set_ylabel("")
|
26
|
+
ax1.tick_params(axis="both", labelsize=18)
|
27
|
+
ax1.legend()
|
28
|
+
|
29
|
+
ax2.plot(rolling_std, label="Rolling Standard Deviation", color="orange")
|
30
|
+
ax2.set_title(
|
31
|
+
f"Rolling STD for {col}",
|
32
|
+
fontsize=20,
|
33
|
+
weight="bold",
|
34
|
+
)
|
35
|
+
ax2.set_xlabel("")
|
36
|
+
ax2.set_ylabel("")
|
37
|
+
ax2.tick_params(axis="both", labelsize=18)
|
38
|
+
ax2.legend()
|
39
|
+
|
40
|
+
return fig
|
41
|
+
|
42
|
+
|
43
|
+
@tags("time_series_data", "visualization", "stationarity")
|
44
|
+
@tasks("regression")
|
45
|
+
def RollingStatsPlot(dataset: VMDataset, window_size: int = 12):
|
12
46
|
"""
|
13
47
|
Evaluates the stationarity of time series data by plotting its rolling mean and standard deviation over a specified
|
14
48
|
window.
|
@@ -58,81 +92,16 @@ class RollingStatsPlot(Metric):
|
|
58
92
|
such as through statistical tests. Therefore, the interpretation is subjective and depends heavily on modeler
|
59
93
|
discretion.
|
60
94
|
"""
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
Plot rolling mean and rolling standard deviation in different subplots for a given series.
|
71
|
-
:param series: Pandas Series with time-series data
|
72
|
-
:param window_size: Window size for the rolling calculations
|
73
|
-
:param ax1: Axis object for the rolling mean plot
|
74
|
-
:param ax2: Axis object for the rolling standard deviation plot
|
75
|
-
"""
|
76
|
-
rolling_mean = (
|
77
|
-
self.inputs.dataset.df[col].rolling(window=int(window_size)).mean()
|
78
|
-
)
|
79
|
-
rolling_std = self.inputs.dataset.df[col].rolling(window=int(window_size)).std()
|
80
|
-
|
81
|
-
# Create a new figure and axis objects
|
82
|
-
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
|
83
|
-
|
84
|
-
ax1.plot(rolling_mean)
|
85
|
-
|
86
|
-
ax1.set_title(
|
87
|
-
f"Rolling Mean for {col}",
|
88
|
-
fontsize=20,
|
89
|
-
weight="bold",
|
90
|
-
)
|
91
|
-
ax1.set_ylabel("")
|
92
|
-
ax1.tick_params(axis="both", labelsize=18)
|
93
|
-
ax1.legend()
|
94
|
-
|
95
|
-
ax2.plot(rolling_std, label="Rolling Standard Deviation", color="orange")
|
96
|
-
ax2.set_title(
|
97
|
-
f"Rolling STD for {col}",
|
98
|
-
fontsize=20,
|
99
|
-
weight="bold",
|
100
|
-
)
|
101
|
-
ax2.set_xlabel("")
|
102
|
-
ax2.set_ylabel("")
|
103
|
-
ax2.tick_params(axis="both", labelsize=18)
|
104
|
-
ax2.legend()
|
105
|
-
|
106
|
-
return fig
|
107
|
-
|
108
|
-
def run(self):
|
109
|
-
if "window_size" not in self.params:
|
110
|
-
raise ValueError("Window size must be provided in params")
|
111
|
-
|
112
|
-
# Check if index is datetime
|
113
|
-
if not pd.api.types.is_datetime64_any_dtype(self.inputs.dataset.df.index):
|
114
|
-
raise ValueError("Index must be a datetime type")
|
115
|
-
|
116
|
-
window_size = self.params["window_size"]
|
117
|
-
df = self.inputs.dataset.df.dropna()
|
118
|
-
|
119
|
-
if not set(df.columns).issubset(set(df.columns)):
|
120
|
-
raise ValueError("Provided 'columns' must exist in the dataset")
|
121
|
-
|
122
|
-
figures = []
|
123
|
-
|
124
|
-
for col in df.columns:
|
125
|
-
fig = self.plot_rolling_statistics(col, window_size=window_size)
|
126
|
-
|
127
|
-
figures.append(
|
128
|
-
Figure(
|
129
|
-
for_object=self,
|
130
|
-
key=f"{self.key}:{col}",
|
131
|
-
figure=fig,
|
132
|
-
)
|
95
|
+
if not pd.api.types.is_datetime64_any_dtype(dataset.df.index):
|
96
|
+
raise SkipTestError("Index must be a datetime type")
|
97
|
+
|
98
|
+
return tuple(
|
99
|
+
[
|
100
|
+
plot_rolling_statistics(
|
101
|
+
df=dataset.df.dropna(),
|
102
|
+
col=col,
|
103
|
+
window_size=window_size,
|
133
104
|
)
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
return self.cache_results(figures=figures)
|
105
|
+
for col in dataset.feature_columns
|
106
|
+
]
|
107
|
+
)
|