validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
@@ -2,8 +2,6 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- import warnings
6
-
7
5
  import numpy as np
8
6
  import pandas as pd
9
7
  import plotly.graph_objects as go
@@ -11,13 +9,17 @@ from plotly.subplots import make_subplots
11
9
  from scipy import stats
12
10
  from statsmodels.tsa.seasonal import seasonal_decompose
13
11
 
12
+ from validmind import tags, tasks
13
+ from validmind.errors import SkipTestError
14
14
  from validmind.logging import get_logger
15
- from validmind.vm_models import Figure, Metric
15
+ from validmind.vm_models import VMDataset
16
16
 
17
17
  logger = get_logger(__name__)
18
18
 
19
19
 
20
- class SeasonalDecompose(Metric):
20
+ @tags("time_series_data", "seasonality", "statsmodels")
21
+ @tasks("regression")
22
+ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
21
23
  """
22
24
  Assesses patterns and seasonality in a time series dataset by decomposing its features into foundational components.
23
25
 
@@ -59,183 +61,99 @@ class SeasonalDecompose(Metric):
59
61
  - **Unreliability with Noisy Datasets**: Produces unreliable results when used with datasets that contain heavy
60
62
  noise.
61
63
  """
62
-
63
- name = "seasonal_decompose"
64
- required_inputs = ["dataset"]
65
- default_params = {"seasonal_model": "additive"}
66
- tasks = ["regression"]
67
- tags = ["time_series_data", "seasonality", "statsmodels"]
68
-
69
- def store_seasonal_decompose(self, column, sd_one_column):
70
- """
71
- Stores the seasonal decomposition results in the test context so they
72
- can be re-used by other tests. Note we store one `sd` at a time for every
73
- column in the dataset.
74
- """
75
- sd_all_columns = self.context.get_context_data("seasonal_decompose") or dict()
76
- sd_all_columns[column] = sd_one_column
77
- self.context.set_context_data("seasonal_decompose", sd_all_columns)
78
-
79
- def serialize_seasonal_decompose(self, sd):
80
- """
81
- Serializes the seasonal decomposition results for one column into a
82
- JSON serializable format that can be sent to the API.
83
- """
84
- results = {
85
- "observed": sd.observed,
86
- "trend": sd.trend,
87
- "seasonal": sd.seasonal,
88
- "resid": sd.resid,
89
- }
90
-
91
- # Convert pandas Series to DataFrames, reset their indices, and convert the dates to strings
92
- dfs = [
93
- pd.DataFrame(series)
94
- .pipe(
95
- lambda x: (
96
- x.reset_index()
97
- if not isinstance(x.index, pd.DatetimeIndex)
98
- else x.reset_index().rename(columns={x.index.name: "Date"})
99
- )
100
- )
101
- .assign(
102
- Date=lambda x: (
103
- x["Date"].astype(str)
104
- if "Date" in x.columns
105
- else x.index.astype(str)
106
- )
107
- )
108
- for series in results.values()
109
- ]
110
-
111
- # Merge DataFrames on the 'Date' column
112
- merged_df = dfs[0]
113
- for df in dfs[1:]:
114
- merged_df = merged_df.merge(df, on="Date")
115
- # Convert the merged DataFrame into a list of dictionaries
116
- return merged_df.to_dict("records")
117
-
118
- def run(self):
119
- # Parse input parameters
120
- if "seasonal_model" not in self.params:
121
- raise ValueError("seasonal_model must be provided in params")
122
- seasonal_model = self.params["seasonal_model"]
123
-
124
- df = self.inputs.dataset.df
125
-
126
- results = {}
127
- figures = []
128
-
129
- for col in df.columns:
130
- series = df[col].dropna()
131
-
132
- # Check for non-finite values and handle them
133
- if not series[np.isfinite(series)].empty:
134
- inferred_freq = pd.infer_freq(series.index)
135
-
136
- if inferred_freq is not None:
137
-
138
- # Only take finite values to seasonal_decompose
139
- sd = seasonal_decompose(
140
- series[np.isfinite(series)], model=seasonal_model
141
- )
142
- self.store_seasonal_decompose(col, sd)
143
-
144
- results[col] = self.serialize_seasonal_decompose(sd)
145
-
146
- # Create subplots using Plotly
147
- fig = make_subplots(
148
- rows=3,
149
- cols=2,
150
- subplot_titles=(
151
- "Observed",
152
- "Trend",
153
- "Seasonal",
154
- "Residuals",
155
- "Histogram and KDE of Residuals",
156
- "Normal Q-Q Plot of Residuals",
157
- ),
158
- vertical_spacing=0.1,
159
- )
160
-
161
- # Observed
162
- fig.add_trace(
163
- go.Scatter(x=sd.observed.index, y=sd.observed, name="Observed"),
164
- row=1,
165
- col=1,
166
- )
167
-
168
- # Trend
169
- fig.add_trace(
170
- go.Scatter(x=sd.trend.index, y=sd.trend, name="Trend"),
171
- row=1,
172
- col=2,
173
- )
174
-
175
- # Seasonal
176
- fig.add_trace(
177
- go.Scatter(x=sd.seasonal.index, y=sd.seasonal, name="Seasonal"),
178
- row=2,
179
- col=1,
180
- )
181
-
182
- # Residuals
183
- fig.add_trace(
184
- go.Scatter(x=sd.resid.index, y=sd.resid, name="Residuals"),
185
- row=2,
186
- col=2,
187
- )
188
-
189
- # Histogram with KDE
190
- residuals = sd.resid.dropna()
191
- fig.add_trace(
192
- go.Histogram(x=residuals, nbinsx=100, name="Residuals"),
193
- row=3,
194
- col=1,
195
- )
196
-
197
- # Normal Q-Q plot
198
- qq = stats.probplot(residuals, plot=None)
199
- qq_line_slope, qq_line_intercept = stats.linregress(
200
- qq[0][0], qq[0][1]
201
- )[:2]
202
- qq_line = qq_line_slope * np.array(qq[0][0]) + qq_line_intercept
203
-
204
- fig.add_trace(
205
- go.Scatter(
206
- x=qq[0][0], y=qq[0][1], mode="markers", name="QQ plot"
207
- ),
208
- row=3,
209
- col=2,
210
- )
211
- fig.add_trace(
212
- go.Scatter(
213
- x=qq[0][0],
214
- y=qq_line,
215
- mode="lines",
216
- name="QQ line",
217
- ),
218
- row=3,
219
- col=2,
220
- )
221
-
222
- fig.update_layout(
223
- height=1000,
224
- title_text=f"Seasonal Decomposition for {col}",
225
- showlegend=False,
226
- )
227
-
228
- figures.append(
229
- Figure(
230
- for_object=self,
231
- key=f"{self.key}:{col}",
232
- figure=fig,
233
- )
234
- )
235
- else:
236
- warnings.warn(
237
- f"No frequency could be inferred for variable '{col}'. "
238
- "Skipping seasonal decomposition and plots for this variable."
239
- )
240
-
241
- return self.cache_results(results, figures=figures)
64
+ df = dataset.df
65
+
66
+ figures = []
67
+
68
+ for col in df.columns:
69
+ series = df[col].dropna()
70
+
71
+ if series[np.isfinite(series)].empty:
72
+ logger.warning(f"No finite values found for {col}, skipping")
73
+ continue
74
+
75
+ inferred_freq = pd.infer_freq(series.index)
76
+ if inferred_freq is None:
77
+ logger.warning(f"No frequency found for {col}, skipping")
78
+ continue
79
+
80
+ sd = seasonal_decompose(series[np.isfinite(series)], model=seasonal_model)
81
+
82
+ # Create subplots using Plotly
83
+ fig = make_subplots(
84
+ rows=3,
85
+ cols=2,
86
+ subplot_titles=(
87
+ "Observed",
88
+ "Trend",
89
+ "Seasonal",
90
+ "Residuals",
91
+ "Histogram and KDE of Residuals",
92
+ "Normal Q-Q Plot of Residuals",
93
+ ),
94
+ vertical_spacing=0.1,
95
+ )
96
+
97
+ # Observed
98
+ fig.add_trace(
99
+ go.Scatter(x=sd.observed.index, y=sd.observed, name="Observed"),
100
+ row=1,
101
+ col=1,
102
+ )
103
+ # Trend
104
+ fig.add_trace(
105
+ go.Scatter(x=sd.trend.index, y=sd.trend, name="Trend"),
106
+ row=1,
107
+ col=2,
108
+ )
109
+ # Seasonal
110
+ fig.add_trace(
111
+ go.Scatter(x=sd.seasonal.index, y=sd.seasonal, name="Seasonal"),
112
+ row=2,
113
+ col=1,
114
+ )
115
+ # Residuals
116
+ fig.add_trace(
117
+ go.Scatter(x=sd.resid.index, y=sd.resid, name="Residuals"),
118
+ row=2,
119
+ col=2,
120
+ )
121
+ # Histogram with KDE
122
+ residuals = sd.resid.dropna()
123
+ fig.add_trace(
124
+ go.Histogram(x=residuals, nbinsx=100, name="Residuals"),
125
+ row=3,
126
+ col=1,
127
+ )
128
+ # Normal Q-Q plot
129
+ qq = stats.probplot(residuals, plot=None)
130
+ qq_line_slope, qq_line_intercept = stats.linregress(qq[0][0], qq[0][1])[:2]
131
+ qq_line = qq_line_slope * np.array(qq[0][0]) + qq_line_intercept
132
+ fig.add_trace(
133
+ go.Scatter(x=qq[0][0], y=qq[0][1], mode="markers", name="QQ plot"),
134
+ row=3,
135
+ col=2,
136
+ )
137
+ fig.add_trace(
138
+ go.Scatter(
139
+ x=qq[0][0],
140
+ y=qq_line,
141
+ mode="lines",
142
+ name="QQ line",
143
+ ),
144
+ row=3,
145
+ col=2,
146
+ )
147
+
148
+ fig.update_layout(
149
+ height=1000,
150
+ title_text=f"Seasonal Decomposition for {col}",
151
+ showlegend=False,
152
+ )
153
+
154
+ figures.append(fig)
155
+
156
+ if not figures:
157
+ raise SkipTestError("No valid features found for seasonal decomposition")
158
+
159
+ return tuple(figures)
@@ -2,23 +2,15 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from dataclasses import dataclass
6
- from typing import List
7
-
8
5
  from ydata_profiling.config import Settings
9
6
  from ydata_profiling.model.typeset import ProfilingTypeSet
10
7
 
11
- from validmind.vm_models import (
12
- ResultSummary,
13
- ResultTable,
14
- ResultTableMetadata,
15
- ThresholdTest,
16
- ThresholdTestResult,
17
- )
8
+ from validmind import tags, tasks
18
9
 
19
10
 
20
- @dataclass
21
- class Skewness(ThresholdTest):
11
+ @tags("data_quality", "tabular_data")
12
+ @tasks("classification", "regression")
13
+ def Skewness(dataset, max_threshold=1):
22
14
  """
23
15
  Evaluates the skewness of numerical data in a dataset to check against a defined threshold, aiming to ensure data
24
16
  quality and optimize model performance.
@@ -57,59 +49,30 @@ class Skewness(ThresholdTest):
57
49
  - Subjective threshold for risk grading, requiring expert input and recurrent iterations for refinement.
58
50
  """
59
51
 
60
- name = "skewness"
61
- required_inputs = ["dataset"]
62
- default_params = {"max_threshold": 1}
63
- tasks = ["classification", "regression"]
64
- tags = ["tabular_data", "data_quality"]
65
-
66
- def summary(self, results: List[ThresholdTestResult], all_passed: bool):
67
- """
68
- The skewness test returns results like these:
69
- [{"values": {"skewness": 1.0}, "column": "NumOfProducts", "passed": false}]
70
- """
71
- results_table = [
52
+ typeset = ProfilingTypeSet(Settings())
53
+ dataset_types = typeset.infer_type(dataset.df)
54
+
55
+ skewness = dataset.df.skew(numeric_only=True)
56
+
57
+ results_table = []
58
+ passed = True
59
+
60
+ for col in skewness.index:
61
+ if str(dataset_types[col]) != "Numeric":
62
+ continue
63
+
64
+ col_skewness = skewness[col]
65
+ col_passed = abs(col_skewness) < max_threshold
66
+ passed = passed and col_passed
67
+
68
+ results_table.append(
72
69
  {
73
- "Column": result.column,
74
- "Skewness": result.values["skewness"],
75
- "Pass/Fail": "Pass" if result.passed else "Fail",
70
+ "Column": col,
71
+ "Skewness": col_skewness,
72
+ "Pass/Fail": "Pass" if col_passed else "Fail",
76
73
  }
77
- for result in results
78
- ]
79
- return ResultSummary(
80
- results=[
81
- ResultTable(
82
- data=results_table,
83
- metadata=ResultTableMetadata(title="Skewness Results for Dataset"),
84
- )
85
- ]
86
74
  )
87
75
 
88
- def run(self):
89
- typeset = ProfilingTypeSet(Settings())
90
- dataset_types = typeset.infer_type(self.inputs.dataset.df)
91
-
92
- skewness = self.inputs.dataset.df.skew(numeric_only=True)
93
-
94
- results = []
95
- passed = []
96
-
97
- for col in skewness.index:
98
- # Only calculate skewness for numerical columns
99
- if str(dataset_types[col]) != "Numeric":
100
- continue
101
-
102
- col_skewness = skewness[col]
103
- col_pass = abs(col_skewness) < self.params["max_threshold"]
104
- passed.append(col_pass)
105
- results.append(
106
- ThresholdTestResult(
107
- column=col,
108
- passed=col_pass,
109
- values={
110
- "skewness": col_skewness,
111
- },
112
- )
113
- )
114
-
115
- return self.cache_results(results, passed=all(passed))
76
+ return {
77
+ "Skewness Results for Dataset": results_table,
78
+ }, passed
@@ -3,12 +3,17 @@
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
5
  import matplotlib.pyplot as plt
6
+ import pandas as pd
6
7
  import seaborn as sns
7
8
 
8
- from validmind.vm_models import Figure, Metric
9
+ from validmind import tags, tasks
10
+ from validmind.errors import SkipTestError
11
+ from validmind.vm_models import VMDataset
9
12
 
10
13
 
11
- class SpreadPlot(Metric):
14
+ @tags("time_series_data", "visualization")
15
+ @tasks("regression")
16
+ def SpreadPlot(dataset: VMDataset):
12
17
  """
13
18
  Assesses potential correlations between pairs of time series variables through visualization to enhance
14
19
  understanding of their relationships.
@@ -51,66 +56,38 @@ class SpreadPlot(Metric):
51
56
  plots.
52
57
  - Might not completely capture intricate non-linear relationships between the variables.
53
58
  """
59
+ # Validate that the index is datetime
60
+ if not isinstance(dataset.df.index, pd.DatetimeIndex):
61
+ raise SkipTestError("Index must be a datetime type for time series analysis")
54
62
 
55
- name = "spread_plot"
56
- required_inputs = ["dataset"]
57
- tasks = ["regression"]
58
- tags = ["time_series_data", "visualization"]
63
+ df = dataset.df.dropna()
59
64
 
60
- @staticmethod
61
- def plot_spread(series1, series2, ax=None):
62
- """
63
- Plot the spread between two time series variables.
64
- :param series1: Pandas Series with time-series data for the first variable
65
- :param series2: Pandas Series with time-series data for the second variable
66
- :param ax: Axis object for the spread plot
67
- """
68
- spread = series1 - series2
65
+ # Get all unique pairs of feature columns
66
+ feature_pairs = [
67
+ (dataset.feature_columns[i], dataset.feature_columns[j])
68
+ for i in range(len(dataset.feature_columns))
69
+ for j in range(i + 1, len(dataset.feature_columns))
70
+ ]
69
71
 
70
- if ax is None:
71
- _, ax = plt.subplots()
72
+ figures = []
72
73
 
73
- sns.lineplot(data=spread, ax=ax)
74
+ for var1, var2 in feature_pairs:
75
+ fig, ax = plt.subplots()
76
+ fig.suptitle(
77
+ f"Spread between {var1} and {var2}",
78
+ fontsize=20,
79
+ weight="bold",
80
+ y=0.95,
81
+ )
74
82
 
75
- return ax
83
+ sns.lineplot(
84
+ data=df[var1] - df[var2],
85
+ ax=ax,
86
+ )
76
87
 
77
- def run(self):
78
- df = self.inputs.dataset.df.dropna()
88
+ ax.set_xlabel("")
89
+ ax.tick_params(axis="both", labelsize=18)
79
90
 
80
- figures = []
81
- columns = df.columns
82
- num_vars = len(columns)
91
+ figures.append(fig)
83
92
 
84
- for i in range(num_vars):
85
- for j in range(i + 1, num_vars):
86
- var1 = columns[i]
87
- var2 = columns[j]
88
-
89
- series1 = df[var1]
90
- series2 = df[var2]
91
-
92
- fig, ax = plt.subplots()
93
- fig.suptitle(
94
- f"Spread between {var1} and {var2}",
95
- fontsize=20,
96
- weight="bold",
97
- y=0.95,
98
- )
99
-
100
- self.plot_spread(series1, series2, ax=ax)
101
-
102
- ax.set_xlabel("")
103
- ax.tick_params(axis="both", labelsize=18)
104
-
105
- # Do this if you want to prevent the figure from being displayed
106
- plt.close("all")
107
-
108
- figures.append(
109
- Figure(
110
- for_object=self,
111
- key=f"{self.key}:{var1}_{var2}",
112
- figure=fig,
113
- )
114
- )
115
-
116
- return self.cache_results(figures=figures)
93
+ return tuple(figures)
@@ -2,13 +2,16 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- import pandas as pd
6
5
  import plotly.graph_objs as go
7
6
 
8
- from validmind.vm_models import Figure, Metric
7
+ from validmind import tags, tasks
8
+ from validmind.errors import SkipTestError
9
+ from validmind.vm_models import VMDataset
9
10
 
10
11
 
11
- class TabularCategoricalBarPlots(Metric):
12
+ @tags("tabular_data", "visualization")
13
+ @tasks("classification", "regression")
14
+ def TabularCategoricalBarPlots(dataset: VMDataset):
12
15
  """
13
16
  Generates and visualizes bar plots for each category in categorical features to evaluate the dataset's composition.
14
17
 
@@ -46,67 +49,45 @@ class TabularCategoricalBarPlots(Metric):
46
49
  - Offers no insights into the model's performance or precision, but rather provides a descriptive analysis of the
47
50
  input.
48
51
  """
49
-
50
- name = "tabular_categorical_bar_plots"
51
- required_inputs = ["dataset"]
52
- tasks = ["classification", "regression"]
53
- tags = ["tabular_data", "visualization"]
54
-
55
- def run(self):
56
- df = self.inputs.dataset.df
57
-
58
- # Extract categorical columns from the dataset
59
- categorical_columns = df.select_dtypes(
60
- include=[object, pd.Categorical]
61
- ).columns.tolist()
62
-
63
- if len(categorical_columns) == 0:
64
- raise ValueError("No categorical columns found in the dataset")
65
-
66
- # Define a color sequence for the categories
67
- color_sequence = [
68
- "#636EFA",
69
- "#EF553B",
70
- "#00CC96",
71
- "#AB63FA",
72
- "#FFA15A",
73
- "#19D3F3",
74
- "#FF6692",
75
- "#B6E880",
76
- "#FF97FF",
77
- "#FECB52",
78
- ]
79
-
80
- figures = []
81
- for col in categorical_columns:
82
- counts = df[col].value_counts()
83
-
84
- fig = go.Figure()
85
- fig.add_trace(
86
- go.Bar(
87
- x=counts.index,
88
- y=counts.values,
89
- name=col,
90
- marker_color=color_sequence[: len(counts)],
91
- )
92
- ) # add colored bar plot trace
93
- fig.update_layout(
94
- title_text=f"{col}", # title of plot
95
- xaxis_title_text="", # xaxis label
96
- yaxis_title_text="", # yaxis label
97
- autosize=False,
98
- width=500,
99
- height=500,
100
- margin=dict(l=50, r=50, b=100, t=100, pad=4),
101
- )
102
- figures.append(
103
- Figure(
104
- for_object=self,
105
- key=f"{self.key}:{col}",
106
- figure=fig,
107
- )
52
+ if not dataset.feature_columns_categorical:
53
+ raise SkipTestError("No categorical columns found in the dataset")
54
+
55
+ color_sequence = [
56
+ "#636EFA",
57
+ "#EF553B",
58
+ "#00CC96",
59
+ "#AB63FA",
60
+ "#FFA15A",
61
+ "#19D3F3",
62
+ "#FF6692",
63
+ "#B6E880",
64
+ "#FF97FF",
65
+ "#FECB52",
66
+ ]
67
+
68
+ figures = []
69
+
70
+ for col in dataset.feature_columns_categorical:
71
+ counts = dataset.df[col].value_counts()
72
+
73
+ fig = go.Figure()
74
+ fig.add_trace(
75
+ go.Bar(
76
+ x=counts.index,
77
+ y=counts.values,
78
+ name=col,
79
+ marker_color=color_sequence[: len(counts)],
108
80
  )
109
-
110
- return self.cache_results(
111
- figures=figures,
112
81
  )
82
+ fig.update_layout(
83
+ title_text=f"{col}",
84
+ xaxis_title_text="",
85
+ yaxis_title_text="",
86
+ autosize=False,
87
+ width=500,
88
+ height=500,
89
+ margin=dict(l=50, r=50, b=100, t=100, pad=4),
90
+ )
91
+ figures.append(fig)
92
+
93
+ return tuple(figures)