validmind-2.5.25-py3-none-any.whl → validmind-2.6.7-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
@@ -2,8 +2,6 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
import warnings
|
6
|
-
|
7
5
|
import numpy as np
|
8
6
|
import pandas as pd
|
9
7
|
import plotly.graph_objects as go
|
@@ -11,13 +9,17 @@ from plotly.subplots import make_subplots
|
|
11
9
|
from scipy import stats
|
12
10
|
from statsmodels.tsa.seasonal import seasonal_decompose
|
13
11
|
|
12
|
+
from validmind import tags, tasks
|
13
|
+
from validmind.errors import SkipTestError
|
14
14
|
from validmind.logging import get_logger
|
15
|
-
from validmind.vm_models import
|
15
|
+
from validmind.vm_models import VMDataset
|
16
16
|
|
17
17
|
logger = get_logger(__name__)
|
18
18
|
|
19
19
|
|
20
|
-
|
20
|
+
@tags("time_series_data", "seasonality", "statsmodels")
|
21
|
+
@tasks("regression")
|
22
|
+
def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
|
21
23
|
"""
|
22
24
|
Assesses patterns and seasonality in a time series dataset by decomposing its features into foundational components.
|
23
25
|
|
@@ -59,183 +61,99 @@ class SeasonalDecompose(Metric):
|
|
59
61
|
- **Unreliability with Noisy Datasets**: Produces unreliable results when used with datasets that contain heavy
|
60
62
|
noise.
|
61
63
|
"""
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
#
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
vertical_spacing=0.1,
|
159
|
-
)
|
160
|
-
|
161
|
-
# Observed
|
162
|
-
fig.add_trace(
|
163
|
-
go.Scatter(x=sd.observed.index, y=sd.observed, name="Observed"),
|
164
|
-
row=1,
|
165
|
-
col=1,
|
166
|
-
)
|
167
|
-
|
168
|
-
# Trend
|
169
|
-
fig.add_trace(
|
170
|
-
go.Scatter(x=sd.trend.index, y=sd.trend, name="Trend"),
|
171
|
-
row=1,
|
172
|
-
col=2,
|
173
|
-
)
|
174
|
-
|
175
|
-
# Seasonal
|
176
|
-
fig.add_trace(
|
177
|
-
go.Scatter(x=sd.seasonal.index, y=sd.seasonal, name="Seasonal"),
|
178
|
-
row=2,
|
179
|
-
col=1,
|
180
|
-
)
|
181
|
-
|
182
|
-
# Residuals
|
183
|
-
fig.add_trace(
|
184
|
-
go.Scatter(x=sd.resid.index, y=sd.resid, name="Residuals"),
|
185
|
-
row=2,
|
186
|
-
col=2,
|
187
|
-
)
|
188
|
-
|
189
|
-
# Histogram with KDE
|
190
|
-
residuals = sd.resid.dropna()
|
191
|
-
fig.add_trace(
|
192
|
-
go.Histogram(x=residuals, nbinsx=100, name="Residuals"),
|
193
|
-
row=3,
|
194
|
-
col=1,
|
195
|
-
)
|
196
|
-
|
197
|
-
# Normal Q-Q plot
|
198
|
-
qq = stats.probplot(residuals, plot=None)
|
199
|
-
qq_line_slope, qq_line_intercept = stats.linregress(
|
200
|
-
qq[0][0], qq[0][1]
|
201
|
-
)[:2]
|
202
|
-
qq_line = qq_line_slope * np.array(qq[0][0]) + qq_line_intercept
|
203
|
-
|
204
|
-
fig.add_trace(
|
205
|
-
go.Scatter(
|
206
|
-
x=qq[0][0], y=qq[0][1], mode="markers", name="QQ plot"
|
207
|
-
),
|
208
|
-
row=3,
|
209
|
-
col=2,
|
210
|
-
)
|
211
|
-
fig.add_trace(
|
212
|
-
go.Scatter(
|
213
|
-
x=qq[0][0],
|
214
|
-
y=qq_line,
|
215
|
-
mode="lines",
|
216
|
-
name="QQ line",
|
217
|
-
),
|
218
|
-
row=3,
|
219
|
-
col=2,
|
220
|
-
)
|
221
|
-
|
222
|
-
fig.update_layout(
|
223
|
-
height=1000,
|
224
|
-
title_text=f"Seasonal Decomposition for {col}",
|
225
|
-
showlegend=False,
|
226
|
-
)
|
227
|
-
|
228
|
-
figures.append(
|
229
|
-
Figure(
|
230
|
-
for_object=self,
|
231
|
-
key=f"{self.key}:{col}",
|
232
|
-
figure=fig,
|
233
|
-
)
|
234
|
-
)
|
235
|
-
else:
|
236
|
-
warnings.warn(
|
237
|
-
f"No frequency could be inferred for variable '{col}'. "
|
238
|
-
"Skipping seasonal decomposition and plots for this variable."
|
239
|
-
)
|
240
|
-
|
241
|
-
return self.cache_results(results, figures=figures)
|
64
|
+
df = dataset.df
|
65
|
+
|
66
|
+
figures = []
|
67
|
+
|
68
|
+
for col in df.columns:
|
69
|
+
series = df[col].dropna()
|
70
|
+
|
71
|
+
if series[np.isfinite(series)].empty:
|
72
|
+
logger.warning(f"No finite values found for {col}, skipping")
|
73
|
+
continue
|
74
|
+
|
75
|
+
inferred_freq = pd.infer_freq(series.index)
|
76
|
+
if inferred_freq is None:
|
77
|
+
logger.warning(f"No frequency found for {col}, skipping")
|
78
|
+
continue
|
79
|
+
|
80
|
+
sd = seasonal_decompose(series[np.isfinite(series)], model=seasonal_model)
|
81
|
+
|
82
|
+
# Create subplots using Plotly
|
83
|
+
fig = make_subplots(
|
84
|
+
rows=3,
|
85
|
+
cols=2,
|
86
|
+
subplot_titles=(
|
87
|
+
"Observed",
|
88
|
+
"Trend",
|
89
|
+
"Seasonal",
|
90
|
+
"Residuals",
|
91
|
+
"Histogram and KDE of Residuals",
|
92
|
+
"Normal Q-Q Plot of Residuals",
|
93
|
+
),
|
94
|
+
vertical_spacing=0.1,
|
95
|
+
)
|
96
|
+
|
97
|
+
# Observed
|
98
|
+
fig.add_trace(
|
99
|
+
go.Scatter(x=sd.observed.index, y=sd.observed, name="Observed"),
|
100
|
+
row=1,
|
101
|
+
col=1,
|
102
|
+
)
|
103
|
+
# Trend
|
104
|
+
fig.add_trace(
|
105
|
+
go.Scatter(x=sd.trend.index, y=sd.trend, name="Trend"),
|
106
|
+
row=1,
|
107
|
+
col=2,
|
108
|
+
)
|
109
|
+
# Seasonal
|
110
|
+
fig.add_trace(
|
111
|
+
go.Scatter(x=sd.seasonal.index, y=sd.seasonal, name="Seasonal"),
|
112
|
+
row=2,
|
113
|
+
col=1,
|
114
|
+
)
|
115
|
+
# Residuals
|
116
|
+
fig.add_trace(
|
117
|
+
go.Scatter(x=sd.resid.index, y=sd.resid, name="Residuals"),
|
118
|
+
row=2,
|
119
|
+
col=2,
|
120
|
+
)
|
121
|
+
# Histogram with KDE
|
122
|
+
residuals = sd.resid.dropna()
|
123
|
+
fig.add_trace(
|
124
|
+
go.Histogram(x=residuals, nbinsx=100, name="Residuals"),
|
125
|
+
row=3,
|
126
|
+
col=1,
|
127
|
+
)
|
128
|
+
# Normal Q-Q plot
|
129
|
+
qq = stats.probplot(residuals, plot=None)
|
130
|
+
qq_line_slope, qq_line_intercept = stats.linregress(qq[0][0], qq[0][1])[:2]
|
131
|
+
qq_line = qq_line_slope * np.array(qq[0][0]) + qq_line_intercept
|
132
|
+
fig.add_trace(
|
133
|
+
go.Scatter(x=qq[0][0], y=qq[0][1], mode="markers", name="QQ plot"),
|
134
|
+
row=3,
|
135
|
+
col=2,
|
136
|
+
)
|
137
|
+
fig.add_trace(
|
138
|
+
go.Scatter(
|
139
|
+
x=qq[0][0],
|
140
|
+
y=qq_line,
|
141
|
+
mode="lines",
|
142
|
+
name="QQ line",
|
143
|
+
),
|
144
|
+
row=3,
|
145
|
+
col=2,
|
146
|
+
)
|
147
|
+
|
148
|
+
fig.update_layout(
|
149
|
+
height=1000,
|
150
|
+
title_text=f"Seasonal Decomposition for {col}",
|
151
|
+
showlegend=False,
|
152
|
+
)
|
153
|
+
|
154
|
+
figures.append(fig)
|
155
|
+
|
156
|
+
if not figures:
|
157
|
+
raise SkipTestError("No valid features found for seasonal decomposition")
|
158
|
+
|
159
|
+
return tuple(figures)
|
@@ -2,23 +2,15 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from dataclasses import dataclass
|
6
|
-
from typing import List
|
7
|
-
|
8
5
|
from ydata_profiling.config import Settings
|
9
6
|
from ydata_profiling.model.typeset import ProfilingTypeSet
|
10
7
|
|
11
|
-
from validmind
|
12
|
-
ResultSummary,
|
13
|
-
ResultTable,
|
14
|
-
ResultTableMetadata,
|
15
|
-
ThresholdTest,
|
16
|
-
ThresholdTestResult,
|
17
|
-
)
|
8
|
+
from validmind import tags, tasks
|
18
9
|
|
19
10
|
|
20
|
-
@
|
21
|
-
|
11
|
+
@tags("data_quality", "tabular_data")
|
12
|
+
@tasks("classification", "regression")
|
13
|
+
def Skewness(dataset, max_threshold=1):
|
22
14
|
"""
|
23
15
|
Evaluates the skewness of numerical data in a dataset to check against a defined threshold, aiming to ensure data
|
24
16
|
quality and optimize model performance.
|
@@ -57,59 +49,30 @@ class Skewness(ThresholdTest):
|
|
57
49
|
- Subjective threshold for risk grading, requiring expert input and recurrent iterations for refinement.
|
58
50
|
"""
|
59
51
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
[
|
70
|
-
|
71
|
-
|
52
|
+
typeset = ProfilingTypeSet(Settings())
|
53
|
+
dataset_types = typeset.infer_type(dataset.df)
|
54
|
+
|
55
|
+
skewness = dataset.df.skew(numeric_only=True)
|
56
|
+
|
57
|
+
results_table = []
|
58
|
+
passed = True
|
59
|
+
|
60
|
+
for col in skewness.index:
|
61
|
+
if str(dataset_types[col]) != "Numeric":
|
62
|
+
continue
|
63
|
+
|
64
|
+
col_skewness = skewness[col]
|
65
|
+
col_passed = abs(col_skewness) < max_threshold
|
66
|
+
passed = passed and col_passed
|
67
|
+
|
68
|
+
results_table.append(
|
72
69
|
{
|
73
|
-
"Column":
|
74
|
-
"Skewness":
|
75
|
-
"Pass/Fail": "Pass" if
|
70
|
+
"Column": col,
|
71
|
+
"Skewness": col_skewness,
|
72
|
+
"Pass/Fail": "Pass" if col_passed else "Fail",
|
76
73
|
}
|
77
|
-
for result in results
|
78
|
-
]
|
79
|
-
return ResultSummary(
|
80
|
-
results=[
|
81
|
-
ResultTable(
|
82
|
-
data=results_table,
|
83
|
-
metadata=ResultTableMetadata(title="Skewness Results for Dataset"),
|
84
|
-
)
|
85
|
-
]
|
86
74
|
)
|
87
75
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
skewness = self.inputs.dataset.df.skew(numeric_only=True)
|
93
|
-
|
94
|
-
results = []
|
95
|
-
passed = []
|
96
|
-
|
97
|
-
for col in skewness.index:
|
98
|
-
# Only calculate skewness for numerical columns
|
99
|
-
if str(dataset_types[col]) != "Numeric":
|
100
|
-
continue
|
101
|
-
|
102
|
-
col_skewness = skewness[col]
|
103
|
-
col_pass = abs(col_skewness) < self.params["max_threshold"]
|
104
|
-
passed.append(col_pass)
|
105
|
-
results.append(
|
106
|
-
ThresholdTestResult(
|
107
|
-
column=col,
|
108
|
-
passed=col_pass,
|
109
|
-
values={
|
110
|
-
"skewness": col_skewness,
|
111
|
-
},
|
112
|
-
)
|
113
|
-
)
|
114
|
-
|
115
|
-
return self.cache_results(results, passed=all(passed))
|
76
|
+
return {
|
77
|
+
"Skewness Results for Dataset": results_table,
|
78
|
+
}, passed
|
@@ -3,12 +3,17 @@
|
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
5
|
import matplotlib.pyplot as plt
|
6
|
+
import pandas as pd
|
6
7
|
import seaborn as sns
|
7
8
|
|
8
|
-
from validmind
|
9
|
+
from validmind import tags, tasks
|
10
|
+
from validmind.errors import SkipTestError
|
11
|
+
from validmind.vm_models import VMDataset
|
9
12
|
|
10
13
|
|
11
|
-
|
14
|
+
@tags("time_series_data", "visualization")
|
15
|
+
@tasks("regression")
|
16
|
+
def SpreadPlot(dataset: VMDataset):
|
12
17
|
"""
|
13
18
|
Assesses potential correlations between pairs of time series variables through visualization to enhance
|
14
19
|
understanding of their relationships.
|
@@ -51,66 +56,38 @@ class SpreadPlot(Metric):
|
|
51
56
|
plots.
|
52
57
|
- Might not completely capture intricate non-linear relationships between the variables.
|
53
58
|
"""
|
59
|
+
# Validate that the index is datetime
|
60
|
+
if not isinstance(dataset.df.index, pd.DatetimeIndex):
|
61
|
+
raise SkipTestError("Index must be a datetime type for time series analysis")
|
54
62
|
|
55
|
-
|
56
|
-
required_inputs = ["dataset"]
|
57
|
-
tasks = ["regression"]
|
58
|
-
tags = ["time_series_data", "visualization"]
|
63
|
+
df = dataset.df.dropna()
|
59
64
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
:param ax: Axis object for the spread plot
|
67
|
-
"""
|
68
|
-
spread = series1 - series2
|
65
|
+
# Get all unique pairs of feature columns
|
66
|
+
feature_pairs = [
|
67
|
+
(dataset.feature_columns[i], dataset.feature_columns[j])
|
68
|
+
for i in range(len(dataset.feature_columns))
|
69
|
+
for j in range(i + 1, len(dataset.feature_columns))
|
70
|
+
]
|
69
71
|
|
70
|
-
|
71
|
-
_, ax = plt.subplots()
|
72
|
+
figures = []
|
72
73
|
|
73
|
-
|
74
|
+
for var1, var2 in feature_pairs:
|
75
|
+
fig, ax = plt.subplots()
|
76
|
+
fig.suptitle(
|
77
|
+
f"Spread between {var1} and {var2}",
|
78
|
+
fontsize=20,
|
79
|
+
weight="bold",
|
80
|
+
y=0.95,
|
81
|
+
)
|
74
82
|
|
75
|
-
|
83
|
+
sns.lineplot(
|
84
|
+
data=df[var1] - df[var2],
|
85
|
+
ax=ax,
|
86
|
+
)
|
76
87
|
|
77
|
-
|
78
|
-
|
88
|
+
ax.set_xlabel("")
|
89
|
+
ax.tick_params(axis="both", labelsize=18)
|
79
90
|
|
80
|
-
figures
|
81
|
-
columns = df.columns
|
82
|
-
num_vars = len(columns)
|
91
|
+
figures.append(fig)
|
83
92
|
|
84
|
-
|
85
|
-
for j in range(i + 1, num_vars):
|
86
|
-
var1 = columns[i]
|
87
|
-
var2 = columns[j]
|
88
|
-
|
89
|
-
series1 = df[var1]
|
90
|
-
series2 = df[var2]
|
91
|
-
|
92
|
-
fig, ax = plt.subplots()
|
93
|
-
fig.suptitle(
|
94
|
-
f"Spread between {var1} and {var2}",
|
95
|
-
fontsize=20,
|
96
|
-
weight="bold",
|
97
|
-
y=0.95,
|
98
|
-
)
|
99
|
-
|
100
|
-
self.plot_spread(series1, series2, ax=ax)
|
101
|
-
|
102
|
-
ax.set_xlabel("")
|
103
|
-
ax.tick_params(axis="both", labelsize=18)
|
104
|
-
|
105
|
-
# Do this if you want to prevent the figure from being displayed
|
106
|
-
plt.close("all")
|
107
|
-
|
108
|
-
figures.append(
|
109
|
-
Figure(
|
110
|
-
for_object=self,
|
111
|
-
key=f"{self.key}:{var1}_{var2}",
|
112
|
-
figure=fig,
|
113
|
-
)
|
114
|
-
)
|
115
|
-
|
116
|
-
return self.cache_results(figures=figures)
|
93
|
+
return tuple(figures)
|
@@ -2,13 +2,16 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
import pandas as pd
|
6
5
|
import plotly.graph_objs as go
|
7
6
|
|
8
|
-
from validmind
|
7
|
+
from validmind import tags, tasks
|
8
|
+
from validmind.errors import SkipTestError
|
9
|
+
from validmind.vm_models import VMDataset
|
9
10
|
|
10
11
|
|
11
|
-
|
12
|
+
@tags("tabular_data", "visualization")
|
13
|
+
@tasks("classification", "regression")
|
14
|
+
def TabularCategoricalBarPlots(dataset: VMDataset):
|
12
15
|
"""
|
13
16
|
Generates and visualizes bar plots for each category in categorical features to evaluate the dataset's composition.
|
14
17
|
|
@@ -46,67 +49,45 @@ class TabularCategoricalBarPlots(Metric):
|
|
46
49
|
- Offers no insights into the model's performance or precision, but rather provides a descriptive analysis of the
|
47
50
|
input.
|
48
51
|
"""
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
#
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
"#FECB52",
|
78
|
-
]
|
79
|
-
|
80
|
-
figures = []
|
81
|
-
for col in categorical_columns:
|
82
|
-
counts = df[col].value_counts()
|
83
|
-
|
84
|
-
fig = go.Figure()
|
85
|
-
fig.add_trace(
|
86
|
-
go.Bar(
|
87
|
-
x=counts.index,
|
88
|
-
y=counts.values,
|
89
|
-
name=col,
|
90
|
-
marker_color=color_sequence[: len(counts)],
|
91
|
-
)
|
92
|
-
) # add colored bar plot trace
|
93
|
-
fig.update_layout(
|
94
|
-
title_text=f"{col}", # title of plot
|
95
|
-
xaxis_title_text="", # xaxis label
|
96
|
-
yaxis_title_text="", # yaxis label
|
97
|
-
autosize=False,
|
98
|
-
width=500,
|
99
|
-
height=500,
|
100
|
-
margin=dict(l=50, r=50, b=100, t=100, pad=4),
|
101
|
-
)
|
102
|
-
figures.append(
|
103
|
-
Figure(
|
104
|
-
for_object=self,
|
105
|
-
key=f"{self.key}:{col}",
|
106
|
-
figure=fig,
|
107
|
-
)
|
52
|
+
if not dataset.feature_columns_categorical:
|
53
|
+
raise SkipTestError("No categorical columns found in the dataset")
|
54
|
+
|
55
|
+
color_sequence = [
|
56
|
+
"#636EFA",
|
57
|
+
"#EF553B",
|
58
|
+
"#00CC96",
|
59
|
+
"#AB63FA",
|
60
|
+
"#FFA15A",
|
61
|
+
"#19D3F3",
|
62
|
+
"#FF6692",
|
63
|
+
"#B6E880",
|
64
|
+
"#FF97FF",
|
65
|
+
"#FECB52",
|
66
|
+
]
|
67
|
+
|
68
|
+
figures = []
|
69
|
+
|
70
|
+
for col in dataset.feature_columns_categorical:
|
71
|
+
counts = dataset.df[col].value_counts()
|
72
|
+
|
73
|
+
fig = go.Figure()
|
74
|
+
fig.add_trace(
|
75
|
+
go.Bar(
|
76
|
+
x=counts.index,
|
77
|
+
y=counts.values,
|
78
|
+
name=col,
|
79
|
+
marker_color=color_sequence[: len(counts)],
|
108
80
|
)
|
109
|
-
|
110
|
-
return self.cache_results(
|
111
|
-
figures=figures,
|
112
81
|
)
|
82
|
+
fig.update_layout(
|
83
|
+
title_text=f"{col}",
|
84
|
+
xaxis_title_text="",
|
85
|
+
yaxis_title_text="",
|
86
|
+
autosize=False,
|
87
|
+
width=500,
|
88
|
+
height=500,
|
89
|
+
margin=dict(l=50, r=50, b=100, t=100, pad=4),
|
90
|
+
)
|
91
|
+
figures.append(fig)
|
92
|
+
|
93
|
+
return tuple(figures)
|