validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
@@ -5,10 +5,14 @@
5
5
  import pandas as pd
6
6
  import plotly.graph_objects as go
7
7
 
8
- from validmind.vm_models import Figure, Metric
8
+ from validmind import tags, tasks
9
+ from validmind.errors import SkipTestError
10
+ from validmind.vm_models import VMDataset
9
11
 
10
12
 
11
- class TabularDateTimeHistograms(Metric):
13
+ @tags("time_series_data", "visualization")
14
+ @tasks("classification", "regression")
15
+ def TabularDateTimeHistograms(dataset: VMDataset):
12
16
  """
13
17
  Generates histograms to provide graphical insight into the distribution of time intervals in a model's datetime
14
18
  data.
@@ -52,46 +56,20 @@ class TabularDateTimeHistograms(Metric):
52
56
  - The test is only applicable to datasets containing datetime columns and will fail if such columns are unavailable.
53
57
  - The interpretation of the histograms relies heavily on the domain expertise and experience of the reviewer.
54
58
  """
55
-
56
- name = "tabular_datetime_histograms"
57
- required_inputs = ["dataset"]
58
-
59
- tasks = ["classification", "regression"]
60
- tags = ["time_series_data", "visualization"]
61
-
62
- def run(self):
63
- df = self.inputs.dataset.df
64
-
65
- # Check if the index is a datetime type
66
- if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
67
- raise ValueError("Index must be a datetime type")
68
-
69
- figures = []
70
-
71
- # Calculate the difference between consecutive dates in the index
72
- date_diffs = df.index.to_series().sort_values().diff().dt.days.dropna()
73
-
74
- # Filter out 0 values
75
- date_diffs = date_diffs[date_diffs != 0]
76
-
77
- # Create a histogram using Plotly
78
- fig = go.Figure()
79
- fig.add_trace(go.Histogram(x=date_diffs, nbinsx=30))
80
- fig.update_layout(
81
- title="Index",
82
- xaxis_title="Days Between Consecutive Dates",
83
- yaxis_title="Frequency",
84
- font=dict(size=18),
85
- )
86
-
87
- figures.append(
88
- Figure(
89
- for_object=self,
90
- key=f"{self.key}:index",
91
- figure=fig,
92
- )
93
- )
94
-
95
- return self.cache_results(
96
- figures=figures,
97
- )
59
+ df = dataset.df
60
+ if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
61
+ raise SkipTestError("Index must be a datetime type")
62
+
63
+ date_diffs = df.index.to_series().sort_values().diff().dt.days.dropna()
64
+ date_diffs = date_diffs[date_diffs != 0]
65
+
66
+ fig = go.Figure()
67
+ fig.add_trace(go.Histogram(x=date_diffs, nbinsx=30))
68
+ fig.update_layout(
69
+ title="Index",
70
+ xaxis_title="Days Between Consecutive Dates",
71
+ yaxis_title="Frequency",
72
+ font=dict(size=18),
73
+ )
74
+
75
+ return fig
@@ -2,13 +2,15 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- import numpy as np
6
5
  import plotly.graph_objs as go
7
6
 
8
- from validmind.vm_models import Figure, Metric
7
+ from validmind import tags, tasks
8
+ from validmind.vm_models import VMDataset
9
9
 
10
10
 
11
- class TabularNumericalHistograms(Metric):
11
+ @tags("tabular_data", "visualization")
12
+ @tasks("classification", "regression")
13
+ def TabularNumericalHistograms(dataset: VMDataset):
12
14
  """
13
15
  Generates histograms for each numerical feature in a dataset to provide visual insights into data distribution and
14
16
  detect potential issues.
@@ -51,47 +53,26 @@ class TabularNumericalHistograms(Metric):
51
53
  - Does not provide any insight into how these features affect the output of the model; it is purely an input
52
54
  analysis tool.
53
55
  """
54
-
55
- name = "tabular_numerical_histograms"
56
- required_inputs = ["dataset"]
57
-
58
- tasks = ["classification", "regression"]
59
- tags = ["tabular_data", "visualization"]
60
-
61
- def run(self):
62
- df = self.inputs.dataset.df
63
-
64
- # Extract numerical columns from the dataset
65
- numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()
66
-
67
- if len(numerical_columns) == 0:
68
- raise ValueError("No numerical columns found in the dataset")
69
-
70
- figures = []
71
- for col in numerical_columns:
72
- fig = go.Figure()
73
- fig.add_trace(
74
- go.Histogram(x=df[col], nbinsx=50, name=col)
75
- ) # add histogram trace
76
- fig.update_layout(
77
- title_text=f"{col}", # title of plot
78
- xaxis_title_text="", # xaxis label
79
- yaxis_title_text="", # yaxis label
80
- bargap=0.2, # gap between bars of adjacent location coordinates
81
- bargroupgap=0.1, # gap between bars of the same location coordinates
82
- autosize=False,
83
- width=500,
84
- height=500,
85
- margin=dict(l=50, r=50, b=100, t=100, pad=4),
86
- )
87
- figures.append(
88
- Figure(
89
- for_object=self,
90
- key=f"{self.key}:{col}",
91
- figure=fig,
92
- )
93
- )
94
-
95
- return self.cache_results(
96
- figures=figures,
56
+ if len(dataset.feature_columns_numeric) == 0:
57
+ raise ValueError("No numerical columns found in the dataset")
58
+
59
+ df = dataset.df
60
+ figures = []
61
+
62
+ for col in dataset.feature_columns_numeric:
63
+ fig = go.Figure()
64
+ fig.add_trace(go.Histogram(x=df[col], nbinsx=50, name=col))
65
+ fig.update_layout(
66
+ title_text=f"{col}",
67
+ xaxis_title_text="",
68
+ yaxis_title_text="",
69
+ bargap=0.2,
70
+ bargroupgap=0.1,
71
+ autosize=False,
72
+ width=500,
73
+ height=500,
74
+ margin=dict(l=50, r=50, b=100, t=100, pad=4),
97
75
  )
76
+ figures.append(fig)
77
+
78
+ return tuple(figures)
@@ -2,13 +2,18 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
+ import numpy as np
5
6
  import plotly.graph_objs as go
6
7
  from plotly.subplots import make_subplots
7
8
 
8
- from validmind.vm_models import Figure, Metric
9
+ from validmind import tags, tasks
10
+ from validmind.errors import SkipTestError
11
+ from validmind.vm_models import VMDataset
9
12
 
10
13
 
11
- class TargetRateBarPlots(Metric):
14
+ @tags("tabular_data", "visualization", "categorical_data")
15
+ @tasks("classification")
16
+ def TargetRateBarPlots(dataset: VMDataset):
12
17
  """
13
18
  Generates bar plots visualizing the default rates of categorical features for a classification machine learning
14
19
  model.
@@ -43,107 +48,63 @@ class TargetRateBarPlots(Metric):
43
48
 
44
49
  ### Limitations
45
50
 
46
- - The test is less useful when dealing with numeric or continuous data, as it's designed specifically for
47
- categorical features.
48
- - If the model in question is dealing with a multi-class problem rather than binary classification, the test's
49
- assumption of binary target values (0s and 1s) becomes a significant limitation.
50
51
  - The readability of the bar plots drops as the number of distinct categories increases in the dataset, which can
51
52
  make them harder to understand and less useful.
52
53
  """
53
-
54
- name = "target_rate_bar_plots"
55
- required_inputs = ["dataset"]
56
- default_params = {"default_column": None, "columns": None}
57
- tasks = ["classification"]
58
- tags = ["tabular_data", "visualization", "categorical_data"]
59
-
60
- def plot_loan_default_ratio(self, default_column, columns=None):
61
- df = self.inputs.dataset.df
62
-
63
- # Use all categorical features if columns is not specified, else use selected columns
64
- if columns is None:
65
- features = self.inputs.dataset.feature_columns_categorical
66
- else:
67
- features = columns
68
-
69
- figures = []
70
- for feature in features:
71
- fig = make_subplots(
72
- rows=1,
73
- cols=2,
74
- )
75
-
76
- # Calculate counts and default rate for each category
77
- counts = df[feature].value_counts()
78
- default_rate = df.groupby(feature)[default_column].mean()
79
-
80
- # Left plot: Counts
81
- fig.add_trace(
82
- go.Bar(
83
- x=counts.index,
84
- y=counts.values,
85
- name="Counts",
86
- marker_color="#6699cc",
87
- ),
88
- row=1,
89
- col=1,
90
- )
91
-
92
- # Right plot: Default rate
93
- fig.add_trace(
94
- go.Bar(
95
- x=default_rate.index,
96
- y=default_rate.values,
97
- name="Target Rate",
98
- marker_color="orange",
99
- ),
100
- row=1,
101
- col=2,
102
- )
103
-
104
- fig.update_layout(
105
- title_text=f"{feature}", # title of plot
106
- autosize=False,
107
- width=500,
108
- height=400,
109
- margin=dict(l=50, r=50, b=100, t=100, pad=4),
110
- )
111
-
112
- figures.append(
113
- Figure(
114
- for_object=self,
115
- key=f"{self.key}:{feature}",
116
- figure=fig,
117
- )
118
- )
119
-
120
- return self.cache_results(
121
- figures=figures,
54
+ if np.unique(dataset.df[dataset.target_column]).size != 2:
55
+ raise SkipTestError(
56
+ f"Target column '{dataset.target_column}' is not binary. "
57
+ "This test only works for binary classification tasks."
122
58
  )
123
59
 
124
- def check_default_column(self, default_column):
125
- if default_column is None:
126
- raise ValueError("The default_column parameter needs to be specified.")
60
+ if len(dataset.feature_columns_categorical) == 0:
61
+ raise SkipTestError("No categorical columns found in the dataset")
127
62
 
128
- unique_values = self.inputs.dataset.df[default_column].unique()
129
- binary_values = [0, 1]
63
+ df = dataset.df
64
+ figures = []
130
65
 
131
- if sorted(unique_values) != binary_values:
132
- raise ValueError(
133
- f"The column {default_column} is not binary. It contains: {unique_values}"
134
- )
66
+ for col in dataset.feature_columns_categorical:
135
67
 
136
- print(f"The column {default_column} is correct and contains only 1 and 0.")
68
+ # Calculate counts and default rate for each category
69
+ counts = df[col].value_counts()
70
+ default_rate = df.groupby(col)[dataset.target_column].mean()
137
71
 
138
- def run(self):
139
- default_column = (
140
- self.params.get("default_column") or self.inputs.dataset.target_column
72
+ fig = make_subplots(
73
+ rows=1,
74
+ cols=2,
141
75
  )
142
- columns = self.params["columns"]
143
76
 
144
- # Check loan status variable has only 1 and 0
145
- self.check_default_column(default_column)
77
+ # Left plot: Counts
78
+ fig.add_trace(
79
+ go.Bar(
80
+ x=counts.index,
81
+ y=counts.values,
82
+ name="Counts",
83
+ marker_color="#6699cc",
84
+ ),
85
+ row=1,
86
+ col=1,
87
+ )
88
+ # Right plot: Default rate
89
+ fig.add_trace(
90
+ go.Bar(
91
+ x=default_rate.index,
92
+ y=default_rate.values,
93
+ name="Target Rate",
94
+ marker_color="orange",
95
+ ),
96
+ row=1,
97
+ col=2,
98
+ )
146
99
 
147
- return self.plot_loan_default_ratio(
148
- default_column=default_column, columns=columns
100
+ fig.update_layout(
101
+ title_text=col,
102
+ autosize=False,
103
+ width=500,
104
+ height=400,
105
+ margin=dict(l=50, r=50, b=100, t=100, pad=4),
149
106
  )
107
+
108
+ figures.append(fig)
109
+
110
+ return tuple(figures)
@@ -2,23 +2,17 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from dataclasses import dataclass
6
-
7
5
  import pandas as pd
8
6
  import plotly.graph_objects as go
9
7
 
10
- from validmind.vm_models import (
11
- Figure,
12
- ResultSummary,
13
- ResultTable,
14
- ResultTableMetadata,
15
- ThresholdTest,
16
- ThresholdTestResult,
17
- )
8
+ from validmind import tags, tasks
9
+ from validmind.errors import SkipTestError
10
+ from validmind.vm_models import VMDataset
18
11
 
19
12
 
20
- @dataclass
21
- class TimeSeriesFrequency(ThresholdTest):
13
+ @tags("time_series_data")
14
+ @tasks("regression")
15
+ def TimeSeriesFrequency(dataset: VMDataset):
22
16
  """
23
17
  Evaluates consistency of time series data frequency and generates a frequency plot.
24
18
 
@@ -63,129 +57,50 @@ class TimeSeriesFrequency(ThresholdTest):
63
57
  - Depending on context or the model under development, mixed frequencies might sometimes be acceptable, but this
64
58
  test considers them a failing condition.
65
59
  """
66
-
67
- name = "time_series_frequency"
68
- required_inputs = ["dataset"]
69
- tasks = ["regression"]
70
- tags = ["time_series_data"]
71
-
72
- def summary(self, results, all_passed):
73
- """
74
- Example output:
75
- [
76
- {
77
- "values": {
78
- "Variable": ["MORTGAGE30US", "GS10", "FEDFUNDS"],
79
- "Frequency": ["Monthly", "Monthly", "Monthly"]
80
- },
81
- "passed": true
82
- }
83
- ]
84
- """
85
- first_result = results[0]
86
- variables = first_result.values["Variable"]
87
- frequencies = first_result.values["Frequency"]
88
-
89
- return ResultSummary(
90
- results=[
91
- ResultTable(
92
- data=pd.DataFrame(
93
- {"Variable": variables, "Frequency": frequencies}
94
- ),
95
- metadata=ResultTableMetadata(
96
- title="Frequency of Time Series Variables"
97
- ),
98
- )
99
- ]
100
- )
101
-
102
- def run(self):
103
- # Check if the index of dataframe is datetime
104
- is_datetime = pd.api.types.is_datetime64_any_dtype(self.inputs.dataset.df.index)
105
- if not is_datetime:
106
- raise ValueError("Dataset must be provided with datetime index")
107
-
108
- freq_df = self._identify_frequencies(self.inputs.dataset.df)
109
- n_frequencies = len(freq_df["Frequency"].unique())
110
- test_results = [
111
- ThresholdTestResult(
112
- passed=n_frequencies == 1,
113
- values=freq_df.to_dict(orient="list"),
114
- )
115
- ]
116
- fig_frequency = self._frequency_plot(self.inputs.dataset.df)
117
- test_figures = []
118
- test_figures.append(
119
- Figure(
120
- for_object=self,
121
- key=f"{self.name}:frequencyplot",
122
- figure=fig_frequency,
123
- metadata={"type": "frequencyplot"},
124
- )
125
- )
126
- return self.cache_results(
127
- test_results,
128
- passed=all([r.passed for r in test_results]),
129
- figures=test_figures,
130
- )
131
-
132
- def _identify_frequencies(self, df):
133
- """
134
- Identify the frequency of each series in the DataFrame.
135
- :param df: Time-series DataFrame
136
- :return: DataFrame with two columns: 'Variable' and 'Frequency'
137
- """
138
- frequencies = []
139
- freq_dict = {
140
- "S": "Second",
141
- "T": "Minute",
142
- "min": "Minute",
143
- "H": "Hourly",
144
- "D": "Daily",
145
- "B": "Business day",
146
- "W": "Weekly",
147
- "MS": "Monthly",
148
- "M": "Monthly",
149
- "Q": "Quarterly",
150
- "A": "Yearly",
151
- "Y": "Yearly",
152
- }
153
-
154
- for column in df.columns:
155
- series = df[column].dropna()
156
- if not series.empty:
157
- freq = pd.infer_freq(series.index)
158
- label = freq_dict.get(freq, freq)
159
- else:
160
- label = None
161
-
162
- frequencies.append({"Variable": column, "Frequency": label})
163
-
164
- freq_df = pd.DataFrame(frequencies)
165
-
166
- return freq_df
167
-
168
- def _frequency_plot(self, df):
169
- """
170
- Creates a frequency plot of time differences between consecutive entries in a DataFrame index using Plotly.
171
- Args:
172
- df (pandas.DataFrame): The input DataFrame.
173
- Returns:
174
- A Plotly Figure object representing the frequency plot of time differences.
175
- """
176
- # Calculate the time differences between consecutive entries
177
- time_diff = df.index.to_series().diff().dropna()
178
-
179
- # Convert the time differences to a suitable unit (e.g., days)
180
- time_diff_days = time_diff.dt.total_seconds() / (60 * 60 * 24)
181
-
182
- # Create a Plotly histogram
183
- fig = go.Figure(data=[go.Histogram(x=time_diff_days, nbinsx=50)])
184
- fig.update_layout(
60
+ df = dataset.df
61
+
62
+ if not pd.api.types.is_datetime64_any_dtype(df.index):
63
+ raise SkipTestError("Dataset must be provided with datetime index")
64
+
65
+ frequencies = []
66
+ freq_dict = {
67
+ "S": "Second",
68
+ "T": "Minute",
69
+ "min": "Minute",
70
+ "H": "Hourly",
71
+ "D": "Daily",
72
+ "B": "Business day",
73
+ "W": "Weekly",
74
+ "MS": "Monthly",
75
+ "M": "Monthly",
76
+ "Q": "Quarterly",
77
+ "A": "Yearly",
78
+ "Y": "Yearly",
79
+ }
80
+
81
+ for column in dataset.feature_columns_numeric:
82
+ series = df[column].dropna()
83
+ if not series.empty:
84
+ freq = pd.infer_freq(series.index)
85
+ label = freq_dict.get(freq, freq)
86
+ else:
87
+ label = None
88
+
89
+ frequencies.append({"Variable": column, "Frequency": label})
90
+
91
+ # Calculate the time differences between consecutive entries
92
+ time_diff = df.index.to_series().diff().dropna()
93
+ # Convert the time differences to a suitable unit (e.g., days)
94
+ time_diff_days = time_diff.dt.total_seconds() / (60 * 60 * 24)
95
+ # Plot the time differences as a histogram
96
+ fig = go.Figure(
97
+ data=[go.Histogram(x=time_diff_days, nbinsx=50)],
98
+ layout=go.Layout(
185
99
  title="Histogram of Time Differences (Days)",
186
100
  xaxis_title="Days",
187
101
  yaxis_title="Frequency",
188
102
  font=dict(size=16),
189
- )
103
+ ),
104
+ )
190
105
 
191
- return fig
106
+ return frequencies, fig, len(set(item["Frequency"] for item in frequencies)) == 1
@@ -2,12 +2,16 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
+ import pandas as pd
5
6
  import plotly.express as px
6
7
 
7
8
  from validmind import tags, tasks
9
+ from validmind.logging import get_logger
8
10
 
11
+ logger = get_logger(__name__)
9
12
 
10
- @tags("data_validation", "visualization")
13
+
14
+ @tags("data_validation", "visualization", "time_series_data")
11
15
  @tasks("regression", "time_series_forecasting")
12
16
  def TimeSeriesHistogram(dataset, nbins=30):
13
17
  """
@@ -51,6 +55,9 @@ def TimeSeriesHistogram(dataset, nbins=30):
51
55
 
52
56
  df = dataset.df
53
57
 
58
+ if not pd.api.types.is_datetime64_any_dtype(df.index):
59
+ raise ValueError(f"Dataset {dataset.input_id} must have a datetime index")
60
+
54
61
  columns = list(dataset.df.columns)
55
62
 
56
63
  if not set(columns).issubset(set(df.columns)):
@@ -58,12 +65,26 @@ def TimeSeriesHistogram(dataset, nbins=30):
58
65
 
59
66
  figures = []
60
67
  for col in columns:
68
+ # Check for missing values and log if any are found
69
+ missing_count = df[col].isna().sum()
70
+ if missing_count > 0:
71
+ logger.info(
72
+ f"Column '{col}' contains {missing_count} missing values which will be excluded from the histogram."
73
+ )
74
+
75
+ # Drop missing values for the current column
76
+ valid_data = df[~df[col].isna()]
77
+
61
78
  fig = px.histogram(
62
- df, x=col, marginal="violin", nbins=nbins, title=f"Histogram for {col}"
79
+ valid_data,
80
+ x=col,
81
+ marginal="violin",
82
+ nbins=nbins,
83
+ title=f"Histogram for {col}",
63
84
  )
64
85
  fig.update_layout(
65
86
  title={
66
- "text": f"{col}",
87
+ "text": f"{col} (n={len(valid_data)})",
67
88
  "y": 0.9,
68
89
  "x": 0.5,
69
90
  "xanchor": "center",