validmind 2.3.3__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (162)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +8 -1
  3. validmind/ai/utils.py +2 -1
  4. validmind/client.py +1 -0
  5. validmind/datasets/regression/fred_timeseries.py +272 -0
  6. validmind/tests/__init__.py +14 -468
  7. validmind/tests/__types__.py +10 -0
  8. validmind/tests/_store.py +102 -0
  9. validmind/tests/data_validation/ACFandPACFPlot.py +7 -9
  10. validmind/tests/data_validation/ADF.py +8 -10
  11. validmind/tests/data_validation/ANOVAOneWayTable.py +8 -10
  12. validmind/tests/data_validation/AutoAR.py +2 -4
  13. validmind/tests/data_validation/AutoMA.py +2 -4
  14. validmind/tests/data_validation/AutoSeasonality.py +8 -10
  15. validmind/tests/data_validation/AutoStationarity.py +8 -10
  16. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +8 -10
  17. validmind/tests/data_validation/BivariateHistograms.py +8 -10
  18. validmind/tests/data_validation/BivariateScatterPlots.py +8 -10
  19. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +8 -10
  20. validmind/tests/data_validation/ClassImbalance.py +2 -4
  21. validmind/tests/data_validation/DFGLSArch.py +2 -4
  22. validmind/tests/data_validation/DatasetDescription.py +7 -9
  23. validmind/tests/data_validation/DatasetSplit.py +8 -9
  24. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  25. validmind/tests/data_validation/Duplicates.py +2 -4
  26. validmind/tests/data_validation/EngleGrangerCoint.py +2 -4
  27. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +2 -4
  28. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +2 -4
  29. validmind/tests/data_validation/HighCardinality.py +2 -4
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +2 -4
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +2 -4
  32. validmind/tests/data_validation/IQROutliersTable.py +2 -4
  33. validmind/tests/data_validation/IsolationForestOutliers.py +2 -4
  34. validmind/tests/data_validation/KPSS.py +8 -10
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +2 -4
  36. validmind/tests/data_validation/MissingValues.py +2 -4
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +2 -4
  38. validmind/tests/data_validation/MissingValuesRisk.py +2 -4
  39. validmind/tests/data_validation/PearsonCorrelationMatrix.py +2 -4
  40. validmind/tests/data_validation/PhillipsPerronArch.py +7 -9
  41. validmind/tests/data_validation/RollingStatsPlot.py +2 -4
  42. validmind/tests/data_validation/ScatterPlot.py +2 -4
  43. validmind/tests/data_validation/SeasonalDecompose.py +70 -44
  44. validmind/tests/data_validation/Skewness.py +2 -4
  45. validmind/tests/data_validation/SpreadPlot.py +2 -4
  46. validmind/tests/data_validation/TabularCategoricalBarPlots.py +2 -4
  47. validmind/tests/data_validation/TabularDateTimeHistograms.py +2 -4
  48. validmind/tests/data_validation/TabularDescriptionTables.py +2 -4
  49. validmind/tests/data_validation/TabularNumericalHistograms.py +2 -4
  50. validmind/tests/data_validation/TargetRateBarPlots.py +2 -4
  51. validmind/tests/data_validation/TimeSeriesDescription.py +74 -0
  52. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +76 -0
  53. validmind/tests/data_validation/TimeSeriesFrequency.py +2 -4
  54. validmind/tests/data_validation/TimeSeriesHistogram.py +29 -45
  55. validmind/tests/data_validation/TimeSeriesLinePlot.py +2 -4
  56. validmind/tests/data_validation/TimeSeriesMissingValues.py +2 -4
  57. validmind/tests/data_validation/TimeSeriesOutliers.py +32 -45
  58. validmind/tests/data_validation/TooManyZeroValues.py +2 -4
  59. validmind/tests/data_validation/UniqueRows.py +2 -4
  60. validmind/tests/data_validation/WOEBinPlots.py +2 -4
  61. validmind/tests/data_validation/WOEBinTable.py +2 -4
  62. validmind/tests/data_validation/ZivotAndrewsArch.py +2 -4
  63. validmind/tests/data_validation/nlp/CommonWords.py +2 -4
  64. validmind/tests/data_validation/nlp/Hashtags.py +2 -4
  65. validmind/tests/data_validation/nlp/Mentions.py +2 -4
  66. validmind/tests/data_validation/nlp/Punctuations.py +2 -4
  67. validmind/tests/data_validation/nlp/StopWords.py +2 -4
  68. validmind/tests/data_validation/nlp/TextDescription.py +2 -4
  69. validmind/tests/decorator.py +10 -8
  70. validmind/tests/load.py +264 -0
  71. validmind/tests/metadata.py +59 -0
  72. validmind/tests/model_validation/ClusterSizeDistribution.py +5 -7
  73. validmind/tests/model_validation/FeaturesAUC.py +6 -8
  74. validmind/tests/model_validation/ModelMetadata.py +8 -9
  75. validmind/tests/model_validation/ModelMetadataComparison.py +59 -0
  76. validmind/tests/model_validation/ModelPredictionResiduals.py +103 -0
  77. validmind/tests/model_validation/RegressionResidualsPlot.py +2 -6
  78. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +131 -0
  79. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +76 -0
  80. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +103 -0
  81. validmind/tests/model_validation/embeddings/ClusterDistribution.py +2 -4
  82. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +2 -4
  83. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -4
  84. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +2 -4
  85. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +2 -4
  86. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +5 -7
  87. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +5 -7
  88. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +7 -9
  89. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -7
  90. validmind/tests/model_validation/sklearn/ClusterPerformance.py +5 -7
  91. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +2 -7
  92. validmind/tests/model_validation/sklearn/CompletenessScore.py +5 -7
  93. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +19 -10
  94. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +83 -0
  95. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +5 -7
  96. validmind/tests/model_validation/sklearn/HomogeneityScore.py +5 -7
  97. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +2 -7
  98. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +4 -7
  99. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +7 -9
  100. validmind/tests/model_validation/sklearn/MinimumF1Score.py +7 -9
  101. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +7 -9
  102. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +8 -10
  103. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +7 -9
  104. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +9 -11
  105. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +7 -9
  106. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +8 -10
  107. validmind/tests/model_validation/sklearn/ROCCurve.py +10 -11
  108. validmind/tests/model_validation/sklearn/RegressionErrors.py +5 -7
  109. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +76 -0
  110. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +5 -7
  111. validmind/tests/model_validation/sklearn/RegressionR2Square.py +5 -7
  112. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +63 -0
  113. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +10 -14
  114. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +8 -10
  115. validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -7
  116. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +8 -10
  117. validmind/tests/model_validation/sklearn/VMeasure.py +5 -7
  118. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +8 -10
  119. validmind/tests/model_validation/statsmodels/AutoARIMA.py +2 -4
  120. validmind/tests/model_validation/statsmodels/BoxPierce.py +2 -4
  121. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +3 -4
  122. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +2 -4
  123. validmind/tests/model_validation/statsmodels/GINITable.py +2 -4
  124. validmind/tests/model_validation/statsmodels/JarqueBera.py +7 -9
  125. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +7 -9
  126. validmind/tests/model_validation/statsmodels/LJungBox.py +2 -4
  127. validmind/tests/model_validation/statsmodels/Lilliefors.py +7 -9
  128. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +2 -4
  129. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +2 -4
  130. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +7 -9
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -4
  132. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -4
  133. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -4
  134. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +2 -4
  135. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -4
  136. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +6 -8
  137. validmind/tests/model_validation/statsmodels/RunsTest.py +2 -4
  138. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +3 -4
  139. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +2 -4
  140. validmind/tests/prompt_validation/Bias.py +2 -4
  141. validmind/tests/prompt_validation/Clarity.py +2 -4
  142. validmind/tests/prompt_validation/Conciseness.py +2 -4
  143. validmind/tests/prompt_validation/Delimitation.py +2 -4
  144. validmind/tests/prompt_validation/NegativeInstruction.py +2 -4
  145. validmind/tests/prompt_validation/Robustness.py +2 -4
  146. validmind/tests/prompt_validation/Specificity.py +2 -4
  147. validmind/tests/run.py +394 -0
  148. validmind/tests/test_providers.py +12 -0
  149. validmind/tests/utils.py +16 -0
  150. validmind/unit_metrics/__init__.py +12 -4
  151. validmind/unit_metrics/composite.py +3 -0
  152. validmind/vm_models/test/metric.py +8 -5
  153. validmind/vm_models/test/result_wrapper.py +2 -1
  154. validmind/vm_models/test/test.py +14 -11
  155. validmind/vm_models/test/threshold_test.py +1 -0
  156. validmind/vm_models/test_suite/runner.py +1 -0
  157. {validmind-2.3.3.dist-info → validmind-2.4.0.dist-info}/METADATA +70 -36
  158. {validmind-2.3.3.dist-info → validmind-2.4.0.dist-info}/RECORD +162 -146
  159. /validmind/datasets/regression/datasets/{lending_club_loan_rates.csv → leanding_club_loan_rates.csv} +0 -0
  160. {validmind-2.3.3.dist-info → validmind-2.4.0.dist-info}/LICENSE +0 -0
  161. {validmind-2.3.3.dist-info → validmind-2.4.0.dist-info}/WHEEL +0 -0
  162. {validmind-2.3.3.dist-info → validmind-2.4.0.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/SeasonalDecompose.py
@@ -4,10 +4,10 @@
 
 import warnings
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import seaborn as sns
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 from scipy import stats
 from statsmodels.tsa.seasonal import seasonal_decompose
 
@@ -59,10 +59,8 @@ class SeasonalDecompose(Metric):
     name = "seasonal_decompose"
     required_inputs = ["dataset"]
     default_params = {"seasonal_model": "additive"}
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["time_series_data", "seasonality", "statsmodels"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "seasonality", "statsmodels"]
 
     def store_seasonal_decompose(self, column, sd_one_column):
         """
@@ -132,7 +130,6 @@ class SeasonalDecompose(Metric):
            inferred_freq = pd.infer_freq(series.index)
 
            if inferred_freq is not None:
-                logger.info(f"Frequency of {col}: {inferred_freq}")
 
                # Only take finite values to seasonal_decompose
                sd = seasonal_decompose(
@@ -142,58 +139,87 @@ class SeasonalDecompose(Metric):
                results[col] = self.serialize_seasonal_decompose(sd)
 
-                # Create subplots
-                fig, axes = plt.subplots(3, 2)
-                width, _ = fig.get_size_inches()
-                fig.set_size_inches(width, 15)
-                fig.subplots_adjust(hspace=0.3)
-                fig.suptitle(
-                    f"Seasonal Decomposition for {col}",
-                    fontsize=20,
-                    weight="bold",
-                    y=0.95,
+                # Create subplots using Plotly
+                fig = make_subplots(
+                    rows=3,
+                    cols=2,
+                    subplot_titles=(
+                        "Observed",
+                        "Trend",
+                        "Seasonal",
+                        "Residuals",
+                        "Histogram and KDE of Residuals",
+                        "Normal Q-Q Plot of Residuals",
+                    ),
+                    vertical_spacing=0.1,
                )
 
-                # Original seasonal decomposition plots
                # Observed
-                sd.observed.plot(ax=axes[0, 0])
-                axes[0, 0].set_title("Observed", fontsize=18)
-                axes[0, 0].set_xlabel("")
-                axes[0, 0].tick_params(axis="both", labelsize=18)
+                fig.add_trace(
+                    go.Scatter(x=sd.observed.index, y=sd.observed, name="Observed"),
+                    row=1,
+                    col=1,
+                )
 
                # Trend
-                sd.trend.plot(ax=axes[0, 1])
-                axes[0, 1].set_title("Trend", fontsize=18)
-                axes[0, 1].set_xlabel("")
-                axes[0, 1].tick_params(axis="both", labelsize=18)
+                fig.add_trace(
+                    go.Scatter(x=sd.trend.index, y=sd.trend, name="Trend"),
+                    row=1,
+                    col=2,
+                )
 
                # Seasonal
-                sd.seasonal.plot(ax=axes[1, 0])
-                axes[1, 0].set_title("Seasonal", fontsize=18)
-                axes[1, 0].set_xlabel("")
-                axes[1, 0].tick_params(axis="both", labelsize=18)
+                fig.add_trace(
+                    go.Scatter(x=sd.seasonal.index, y=sd.seasonal, name="Seasonal"),
+                    row=2,
+                    col=1,
+                )
 
                # Residuals
-                sd.resid.plot(ax=axes[1, 1])
-                axes[1, 1].set_title("Residuals", fontsize=18)
-                axes[1, 1].set_xlabel("")
-                axes[1, 1].tick_params(axis="both", labelsize=18)
+                fig.add_trace(
+                    go.Scatter(x=sd.resid.index, y=sd.resid, name="Residuals"),
+                    row=2,
+                    col=2,
+                )
 
                # Histogram with KDE
                residuals = sd.resid.dropna()
-                sns.histplot(residuals, kde=True, ax=axes[2, 0])
-                axes[2, 0].set_title("Histogram and KDE of Residuals", fontsize=18)
-                axes[2, 0].set_xlabel("")
-                axes[2, 0].tick_params(axis="both", labelsize=18)
+                fig.add_trace(
+                    go.Histogram(x=residuals, nbinsx=100, name="Residuals"),
+                    row=3,
+                    col=1,
+                )
 
                # Normal Q-Q plot
-                stats.probplot(residuals, plot=axes[2, 1])
-                axes[2, 1].set_title("Normal Q-Q Plot of Residuals", fontsize=18)
-                axes[2, 1].set_xlabel("")
-                axes[2, 1].tick_params(axis="both", labelsize=18)
+                qq = stats.probplot(residuals, plot=None)
+                qq_line_slope, qq_line_intercept = stats.linregress(
+                    qq[0][0], qq[0][1]
+                )[:2]
+                qq_line = qq_line_slope * np.array(qq[0][0]) + qq_line_intercept
+
+                fig.add_trace(
+                    go.Scatter(
+                        x=qq[0][0], y=qq[0][1], mode="markers", name="QQ plot"
+                    ),
+                    row=3,
+                    col=2,
+                )
+                fig.add_trace(
+                    go.Scatter(
+                        x=qq[0][0],
+                        y=qq_line,
+                        mode="lines",
+                        name="QQ line",
+                    ),
+                    row=3,
+                    col=2,
+                )
 
-                # Do this if you want to prevent the figure from being displayed
-                plt.close("all")
+                fig.update_layout(
+                    height=1000,
+                    title_text=f"Seasonal Decomposition for {col}",
+                    showlegend=False,
+                )
 
                figures.append(
                    Figure(
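Plotly has no direct counterpart to rendering scipy's probplot onto a matplotlib axes, so the new code computes the Q-Q points and reference line by hand: stats.probplot(..., plot=None) returns the theoretical and ordered sample quantiles, and stats.linregress refits the reference line. A standalone sketch of the same technique on synthetic residuals:

# Standalone sketch of the manual Q-Q construction above (synthetic residuals).
# probplot(..., plot=None) returns ((theoretical_q, ordered_values), (slope, intercept, r)).
import numpy as np
import plotly.graph_objects as go
from scipy import stats

residuals = np.random.default_rng(0).normal(size=500)

qq = stats.probplot(residuals, plot=None)
slope, intercept = stats.linregress(qq[0][0], qq[0][1])[:2]
qq_line = slope * np.array(qq[0][0]) + intercept

fig = go.Figure()
fig.add_trace(go.Scatter(x=qq[0][0], y=qq[0][1], mode="markers", name="QQ plot"))
fig.add_trace(go.Scatter(x=qq[0][0], y=qq_line, mode="lines", name="QQ line"))
fig.update_layout(title_text="Normal Q-Q Plot of Residuals", showlegend=False)
fig.show()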
validmind/tests/data_validation/Skewness.py
@@ -62,10 +62,8 @@ class Skewness(ThresholdTest):
     name = "skewness"
     required_inputs = ["dataset"]
     default_params = {"max_threshold": 1}
-    metadata = {
-        "task_types": ["classification", "regression"],
-        "tags": ["tabular_data", "data_quality"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "data_quality"]
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         """
validmind/tests/data_validation/SpreadPlot.py
@@ -54,10 +54,8 @@ class SpreadPlot(Metric):
 
     name = "spread_plot"
     required_inputs = ["dataset"]
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["time_series_data", "visualization"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "visualization"]
 
     @staticmethod
     def plot_spread(series1, series2, ax=None):
validmind/tests/data_validation/TabularCategoricalBarPlots.py
@@ -43,10 +43,8 @@ class TabularCategoricalBarPlots(Metric):
 
     name = "tabular_categorical_bar_plots"
    required_inputs = ["dataset"]
-    metadata = {
-        "task_types": ["classification", "regression"],
-        "tags": ["tabular_data", "visualization"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "visualization"]
 
     def run(self):
         df = self.inputs.dataset.df
validmind/tests/data_validation/TabularDateTimeHistograms.py
@@ -48,10 +48,8 @@ class TabularDateTimeHistograms(Metric):
     name = "tabular_datetime_histograms"
     required_inputs = ["dataset"]
 
-    metadata = {
-        "task_types": ["classification", "regression"],
-        "tags": ["time_series_data", "visualization"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["time_series_data", "visualization"]
 
     def run(self):
         df = self.inputs.dataset.df
validmind/tests/data_validation/TabularDescriptionTables.py
@@ -57,10 +57,8 @@ class TabularDescriptionTables(Metric):
     name = "tabular_description_tables"
     required_inputs = ["dataset"]
 
-    metadata = {
-        "task_types": ["classification", "regression"],
-        "tags": ["tabular_data"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data"]
 
     def get_summary_statistics_numerical(self, numerical_fields):
         summary_stats = self.inputs.dataset.df[numerical_fields].describe().T
validmind/tests/data_validation/TabularNumericalHistograms.py
@@ -52,10 +52,8 @@ class TabularNumericalHistograms(Metric):
     name = "tabular_numerical_histograms"
     required_inputs = ["dataset"]
 
-    metadata = {
-        "task_types": ["classification", "regression"],
-        "tags": ["tabular_data", "visualization"],
-    }
+    tasks = ["classification", "regression"]
+    tags = ["tabular_data", "visualization"]
 
     def run(self):
         df = self.inputs.dataset.df
validmind/tests/data_validation/TargetRateBarPlots.py
@@ -47,10 +47,8 @@ class TargetRateBarPlots(Metric):
     name = "target_rate_bar_plots"
     required_inputs = ["dataset"]
     default_params = {"default_column": None, "columns": None}
-    metadata = {
-        "task_types": ["classification"],
-        "tags": ["tabular_data", "visualization", "categorical_data"],
-    }
+    tasks = ["classification"]
+    tags = ["tabular_data", "visualization", "categorical_data"]
 
     def plot_loan_default_ratio(self, default_column, columns=None):
         df = self.inputs.dataset.df
validmind/tests/data_validation/TimeSeriesDescription.py (new file)
@@ -0,0 +1,74 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+
+from validmind import tags, tasks
+
+
+@tags("time_series_data", "analysis")
+@tasks("regression")
+def TimeSeriesDescription(dataset):
+    """
+    Generates a detailed analysis for the provided time series dataset.
+
+    **Purpose**: The purpose of the TimeSeriesDescription function is to analyze an individual time series
+    by providing a summary of key statistics. This helps in understanding trends, patterns, and data quality issues
+    within the time series.
+
+    **Test Mechanism**: The function extracts the time series data and provides a summary of key statistics.
+    The dataset is expected to have a datetime index. The function checks this and raises an error if the index is
+    not in datetime format. For each variable (column) in the dataset, appropriate statistics including start date,
+    end date, frequency, number of missing values, count, min, and max values are calculated.
+
+    **Signs of High Risk**:
+    - If the index of the dataset is not in datetime format, it could lead to errors in time-series analysis.
+    - Inconsistent or missing data within the dataset might affect the analysis of trends and patterns.
+
+    **Strengths**:
+    - This function provides a comprehensive summary of key statistics for each variable, helping to identify data quality
+    issues such as missing values.
+    - The function helps in understanding the distribution and range of the data by including min and max values.
+
+    **Limitations**:
+    - This function assumes that the dataset is provided as a DataFrameDataset object with a .df attribute to access
+    the pandas DataFrame.
+    - It only analyzes datasets with a datetime index and will raise an error for other types of indices.
+    - The function does not handle large datasets efficiently, and performance may degrade with very large datasets.
+    """
+
+    summary = []
+
+    df = (
+        dataset.df
+    )  # Assuming DataFrameDataset objects have a .df attribute to get the pandas DataFrame
+
+    if not pd.api.types.is_datetime64_any_dtype(df.index):
+        raise ValueError(f"Dataset {dataset.input_id} must have a datetime index")
+
+    for column in df.columns:
+        start_date = df.index.min().strftime("%Y-%m-%d")
+        end_date = df.index.max().strftime("%Y-%m-%d")
+        frequency = pd.infer_freq(df.index)
+        num_missing_values = df[column].isna().sum()
+        count = df[column].count()
+        min_value = df[column].min()
+        max_value = df[column].max()
+
+        summary.append(
+            {
+                "Variable": column,
+                "Start Date": start_date,
+                "End Date": end_date,
+                "Frequency": frequency,
+                "Num of Missing Values": num_missing_values,
+                "Count": count,
+                "Min Value": min_value,
+                "Max Value": max_value,
+            }
+        )
+
+    result_df = pd.DataFrame(summary)
+
+    return result_df
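TimeSeriesDescription is one of several new tests written in the decorator-based functional style (@tags/@tasks on a plain function returning a DataFrame). A hedged usage sketch, assuming the run_test entry point added in validmind/tests/run.py is the public runner and that the environment has been initialized with vm.init beforehand:

# Hedged usage sketch: assumes vm.tests.run_test is the public runner added in
# validmind/tests/run.py and that vm.init(...) has been called beforehand.
import pandas as pd
import validmind as vm

df = pd.DataFrame(
    {"loan_rate": [5.1, 5.3, None, 5.6]},
    index=pd.date_range("2024-01-01", periods=4, freq="D"),  # datetime index is required
)
vm_dataset = vm.init_dataset(dataset=df, input_id="rates")

result = vm.tests.run_test(
    "validmind.data_validation.TimeSeriesDescription",
    inputs={"dataset": vm_dataset},
)
result.show()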
validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py (new file)
@@ -0,0 +1,76 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+from scipy.stats import kurtosis, skew
+
+from validmind import tags, tasks
+
+
+@tags("time_series_data", "analysis")
+@tasks("regression")
+def TimeSeriesDescriptiveStatistics(dataset):
+    """
+    Generates a detailed table of descriptive statistics for the provided time series dataset.
+
+    **Purpose**: The purpose of the TimeSeriesDescriptiveStatistics function is to analyze an individual time series
+    by providing a summary of key descriptive statistics. This helps in understanding trends, patterns, and data quality issues
+    within the time series.
+
+    **Test Mechanism**: The function extracts the time series data and provides a summary of key descriptive statistics.
+    The dataset is expected to have a datetime index. The function checks this and raises an error if the index is
+    not in datetime format. For each variable (column) in the dataset, appropriate statistics including start date,
+    end date, min, mean, max, skewness, kurtosis, and count are calculated.
+
+    **Signs of High Risk**:
+    - If the index of the dataset is not in datetime format, it could lead to errors in time-series analysis.
+    - Inconsistent or missing data within the dataset might affect the analysis of trends and patterns.
+
+    **Strengths**:
+    - This function provides a comprehensive summary of key descriptive statistics for each variable, helping to identify data quality
+    issues and understand the distribution of the data.
+
+    **Limitations**:
+    - This function assumes that the dataset is provided as a DataFrameDataset object with a .df attribute to access
+    the pandas DataFrame.
+    - It only analyzes datasets with a datetime index and will raise an error for other types of indices.
+    - The function does not handle large datasets efficiently, and performance may degrade with very large datasets.
+    """
+
+    summary = []
+
+    df = (
+        dataset.df
+    )  # Assuming DataFrameDataset objects have a .df attribute to get the pandas DataFrame
+
+    if not pd.api.types.is_datetime64_any_dtype(df.index):
+        raise ValueError(f"Dataset {dataset.input_id} must have a datetime index")
+
+    for column in df.columns:
+        start_date = df.index.min().strftime("%Y-%m-%d")
+        end_date = df.index.max().strftime("%Y-%m-%d")
+        count = df[column].count()
+        min_value = df[column].min()
+        mean_value = df[column].mean()
+        max_value = df[column].max()
+        skewness_value = skew(df[column].dropna())
+        kurtosis_value = kurtosis(df[column].dropna())
+
+        summary.append(
+            {
+                "Variable": column,
+                "Start Date": start_date,
+                "End Date": end_date,
+                "Min": min_value,
+                "Mean": mean_value,
+                "Max": max_value,
+                "Skewness": skewness_value,
+                "Kurtosis": kurtosis_value,
+                "Count": count,
+            }
+        )
+
+    result_df = pd.DataFrame(summary)
+
+    return result_df
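The per-column statistics come straight from pandas and scipy. A standalone check of what one summary row contains, mirroring the loop body above on synthetic data:

# Standalone check of one summary row, mirroring the loop body above.
# scipy's kurtosis uses the Fisher definition by default (normal -> 0).
import pandas as pd
from scipy.stats import kurtosis, skew

s = pd.Series(
    [1.0, 2.5, 2.7, None, 3.1, 10.0],
    index=pd.date_range("2024-01-01", periods=6, freq="MS"),
)
row = {
    "Start Date": s.index.min().strftime("%Y-%m-%d"),
    "End Date": s.index.max().strftime("%Y-%m-%d"),
    "Min": s.min(),
    "Mean": s.mean(),
    "Max": s.max(),
    "Skewness": skew(s.dropna()),
    "Kurtosis": kurtosis(s.dropna()),
    "Count": s.count(),
}
print(pd.DataFrame([row]))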
validmind/tests/data_validation/TimeSeriesFrequency.py
@@ -59,10 +59,8 @@ class TimeSeriesFrequency(ThresholdTest):
 
     name = "time_series_frequency"
     required_inputs = ["dataset"]
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["time_series_data"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data"]
 
     def summary(self, results, all_passed):
         """
validmind/tests/data_validation/TimeSeriesHistogram.py
@@ -2,14 +2,14 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
+import plotly.express as px
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
 
 
-class TimeSeriesHistogram(Metric):
+@tags("data_validation", "visualization")
+@tasks("regression", "time_series_forecasting")
+def TimeSeriesHistogram(dataset, nbins=30):
     """
     Visualizes distribution of time-series data using histograms and Kernel Density Estimation (KDE) lines.
 
@@ -20,7 +20,7 @@ class TimeSeriesHistogram(Metric):
     (kurtosis) underlying the data.
 
     **Test Mechanism**: This test operates on a specific column within the dataset that is required to have a datetime
-    type index. It goes through each column in the given dataset, creating a histogram with Seaborn's histplot
+    type index. It goes through each column in the given dataset, creating a histogram with Plotly's histplot
     function. In cases where the dataset includes more than one time-series (i.e., more than one column with a datetime
     type index), a distinct histogram is plotted for each series. Additionally, a kernel density estimate (KDE) line is
     drawn for each histogram, providing a visualization of the data's underlying probability distribution. The x and
@@ -48,46 +48,30 @@ class TimeSeriesHistogram(Metric):
     - The histogram's shape may be sensitive to the number of bins used.
     """
 
-    name = "time_series_histogram"
-    required_inputs = ["dataset"]
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["time_series_data", "visualization"],
-    }
+    df = dataset.df
 
-    def run(self):
-        # Check if index is datetime
-        if not pd.api.types.is_datetime64_any_dtype(self.inputs.dataset.df.index):
-            raise ValueError("Index must be a datetime type")
+    columns = list(dataset.df.columns)
 
-        columns = list(self.inputs.dataset.df.columns)
+    if not set(columns).issubset(set(df.columns)):
+        raise ValueError("Provided 'columns' must exist in the dataset")
 
-        df = self.inputs.dataset.df
-
-        if not set(columns).issubset(set(df.columns)):
-            raise ValueError("Provided 'columns' must exist in the dataset")
-
-        figures = []
-        for col in columns:
-            plt.figure()
-            fig, _ = plt.subplots()
-            ax = sns.histplot(data=df, x=col, kde=True)
-            plt.title(f"Histogram for {col}", weight="bold", fontsize=20)
-
-            plt.xticks(fontsize=18)
-            plt.yticks(fontsize=18)
-            ax.set_xlabel("")
-            ax.set_ylabel("")
-            figures.append(
-                Figure(
-                    for_object=self,
-                    key=f"{self.key}:{col}",
-                    figure=fig,
-                )
-            )
-
-            plt.close("all")
-
-        return self.cache_results(
-            figures=figures,
+    figures = []
+    for col in columns:
+        fig = px.histogram(
+            df, x=col, marginal="violin", nbins=nbins, title=f"Histogram for {col}"
+        )
+        fig.update_layout(
+            title={
+                "text": f"Histogram for {col}",
+                "y": 0.9,
+                "x": 0.5,
+                "xanchor": "center",
+                "yanchor": "top",
+            },
+            xaxis_title="",
+            yaxis_title="",
+            font=dict(size=18),
        )
+        figures.append(fig)
+
+    return tuple(figures)
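The rewritten test returns plain Plotly figures from a decorated function rather than caching Figure wrappers on a Metric instance. A minimal sketch of the same px.histogram call outside the test harness, on synthetic data; marginal="violin" draws the distribution strip that stands in for the old seaborn KDE overlay:

# Minimal sketch of the px.histogram call used above, outside the test harness.
import numpy as np
import pandas as pd
import plotly.express as px

df = pd.DataFrame(
    {"rate": np.random.default_rng(1).normal(5.0, 0.5, 365)},
    index=pd.date_range("2023-01-01", periods=365, freq="D"),
)
fig = px.histogram(df, x="rate", marginal="violin", nbins=30, title="Histogram for rate")
fig.show()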
validmind/tests/data_validation/TimeSeriesLinePlot.py
@@ -47,10 +47,8 @@ class TimeSeriesLinePlot(Metric):
 
     name = "time_series_line_plot"
     required_inputs = ["dataset"]
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["time_series_data", "visualization"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data", "visualization"]
 
     def run(self):
         # Check if index is datetime
validmind/tests/data_validation/TimeSeriesMissingValues.py
@@ -65,10 +65,8 @@ class TimeSeriesMissingValues(ThresholdTest):
     name = "time_series_missing_values"
     required_inputs = ["dataset"]
     default_params = {"min_threshold": 1}
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["time_series_data"],
-    }
+    tasks = ["regression"]
+    tags = ["time_series_data"]
 
     def summary(self, results, all_passed):
         results_table = [