validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
@@ -5,10 +5,14 @@
5
5
  import pandas as pd
6
6
  import plotly.graph_objects as go
7
7
 
8
- from validmind.vm_models import Figure, Metric
8
+ from validmind import tags, tasks
9
+ from validmind.errors import SkipTestError
10
+ from validmind.vm_models import VMDataset
9
11
 
10
12
 
11
- class TimeSeriesLinePlot(Metric):
13
+ @tags("time_series_data", "visualization")
14
+ @tasks("regression")
15
+ def TimeSeriesLinePlot(dataset: VMDataset):
12
16
  """
13
17
  Generates and analyses time-series data through line plots revealing trends, patterns, anomalies over time.
14
18
 
@@ -51,49 +55,27 @@ class TimeSeriesLinePlot(Metric):
51
55
  - The metric has an inherent limitation in that it cannot extract deeper statistical insights from the time series
52
56
  data, which can limit its efficacy with complex data structures and phenomena.
53
57
  """
54
-
55
- name = "time_series_line_plot"
56
- required_inputs = ["dataset"]
57
- tasks = ["regression"]
58
- tags = ["time_series_data", "visualization"]
59
-
60
- def run(self):
61
- # Check if index is datetime
62
- if not pd.api.types.is_datetime64_any_dtype(self.inputs.dataset.df.index):
63
- raise ValueError("Index must be a datetime type")
64
-
65
- columns = list(self.inputs.dataset.df.columns)
66
- df = self.inputs.dataset.df
67
-
68
- if not set(columns).issubset(set(df.columns)):
69
- raise ValueError("Provided 'columns' must exist in the dataset")
70
-
71
- figures = []
72
- for col in columns:
73
- # Creating the figure using Plotly
74
- fig = go.Figure()
75
-
76
- fig.add_trace(go.Scatter(x=df.index, y=df[col], mode="lines", name=col))
77
-
78
- fig.update_layout(
79
- title={
80
- "text": f"{col}",
81
- "y": 0.95,
82
- "x": 0.5,
83
- "xanchor": "center",
84
- "yanchor": "top",
85
- },
86
- font=dict(size=16),
87
- )
88
-
89
- figures.append(
90
- Figure(
91
- for_object=self,
92
- key=f"{self.key}:{col}",
93
- figure=fig,
94
- )
95
- )
96
-
97
- return self.cache_results(
98
- figures=figures,
58
+ df = dataset.df
59
+
60
+ if not pd.api.types.is_datetime64_any_dtype(df.index):
61
+ raise SkipTestError("Index must be a datetime type")
62
+
63
+ figures = []
64
+
65
+ for col in dataset.feature_columns_numeric:
66
+ fig = go.Figure()
67
+ fig.add_trace(go.Scatter(x=df.index, y=df[col], mode="lines", name=col))
68
+ fig.update_layout(
69
+ title={
70
+ "text": col,
71
+ "y": 0.95,
72
+ "x": 0.5,
73
+ "xanchor": "center",
74
+ "yanchor": "top",
75
+ },
76
+ font=dict(size=16),
99
77
  )
78
+
79
+ figures.append(fig)
80
+
81
+ return tuple(figures)
@@ -2,24 +2,18 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from dataclasses import dataclass
6
-
7
5
  import pandas as pd
8
6
  import plotly.express as px
9
7
  import plotly.figure_factory as ff
10
8
 
11
- from validmind.vm_models import (
12
- Figure,
13
- ResultSummary,
14
- ResultTable,
15
- ResultTableMetadata,
16
- ThresholdTest,
17
- ThresholdTestResult,
18
- )
9
+ from validmind import tags, tasks
10
+ from validmind.errors import SkipTestError
11
+ from validmind.vm_models import VMDataset
19
12
 
20
13
 
21
- @dataclass
22
- class TimeSeriesMissingValues(ThresholdTest):
14
+ @tags("time_series_data")
15
+ @tasks("regression")
16
+ def TimeSeriesMissingValues(dataset: VMDataset, min_threshold: int = 1):
23
17
  """
24
18
  Validates time-series data quality by confirming the count of missing values is below a certain threshold.
25
19
 
@@ -37,17 +31,11 @@ class TimeSeriesMissingValues(ThresholdTest):
37
31
  dataset. An object for the test result is created stating whether the number of missing values is within the
38
32
  specified threshold. Additionally, the test calculates the percentage of missing values alongside the raw count.
39
33
 
40
- To aid in data visualization, the test generates two plots - a bar plot and a heatmap - to better illustrate the
41
- distribution and quantity of missing values per variable. The test results, including a count of missing values,
42
- the percentage of missing values, and a pass/fail status, are returned in a results table.
43
-
44
34
  ### Signs of High Risk
45
35
 
46
36
  - The number of missing values in any column of the dataset surpasses the threshold, marking a failure and a
47
37
  high-risk scenario. The reasons could range from incomplete data collection, faulty sensors to data preprocessing
48
38
  errors.
49
- - A continuous visual 'streak' in the heatmap may indicate a systematic error during data collection, pointing
50
- towards another potential risk source.
51
39
 
52
40
  ### Strengths
53
41
 
@@ -55,7 +43,6 @@ class TimeSeriesMissingValues(ThresholdTest):
55
43
  - Applicable and customizable through the threshold parameter across different data sets.
56
44
  - Goes beyond raw numbers by calculating the percentage of missing values, offering a more relative understanding
57
45
  of data scarcity.
58
- - Includes a robust visualization mechanism for easy and fast understanding of data quality.
59
46
 
60
47
  ### Limitations
61
48
 
@@ -66,124 +53,61 @@ class TimeSeriesMissingValues(ThresholdTest):
66
53
  overlook problematic data if set too loosely.
67
54
  - Solely focuses on the 'missingness' of the data and might fall short in addressing other aspects of data quality.
68
55
  """
56
+ df = dataset.df
57
+
58
+ if not pd.api.types.is_datetime64_any_dtype(df.index):
59
+ raise SkipTestError("Dataset must be provided with datetime index")
69
60
 
70
- name = "time_series_missing_values"
71
- required_inputs = ["dataset"]
72
- default_params = {"min_threshold": 1}
73
- tasks = ["regression"]
74
- tags = ["time_series_data"]
61
+ missing = df.isna().sum()
75
62
 
76
- def summary(self, results, all_passed):
77
- results_table = [
63
+ if sum(missing.values) == 0:
64
+ # if theres no missing values, no need to plot anything
65
+ return [
66
+ {
67
+ "Column": col,
68
+ "Number of Missing Values": missing[col],
69
+ "Percentage of Missing Values (%)": 0,
70
+ "Pass/Fail": "Pass",
71
+ }
72
+ for col in missing.index
73
+ ], True
74
+
75
+ barplot = px.bar(
76
+ missing,
77
+ x=missing.index,
78
+ y=missing.values,
79
+ labels={"x": "", "y": "Missing Values"},
80
+ title="Total Number of Missing Values per Variable",
81
+ color=missing.values,
82
+ color_continuous_scale="Reds",
83
+ )
84
+
85
+ missing_mask = df.isnull()
86
+ z = missing_mask.T.astype(int).values
87
+ x = missing_mask.index.tolist()
88
+ y = missing_mask.columns.tolist()
89
+ heatmap = ff.create_annotated_heatmap(
90
+ z=z,
91
+ x=x,
92
+ y=y,
93
+ colorscale="Reds",
94
+ showscale=False,
95
+ )
96
+
97
+ # Update the layout after creation
98
+ heatmap.update_layout(title="Missing Values Heatmap")
99
+
100
+ return (
101
+ [
78
102
  {
79
- "Column": result.column,
80
- "Number of Missing Values": result.values["n_missing"],
81
- "Percentage of Missing Values (%)": result.values["p_missing"] * 100,
82
- "Pass/Fail": "Pass" if result.passed else "Fail",
103
+ "Column": col,
104
+ "Number of Missing Values": missing[col],
105
+ "Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
106
+ "Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
83
107
  }
84
- for result in results
85
- ]
86
- return ResultSummary(
87
- results=[
88
- ResultTable(
89
- data=results_table,
90
- metadata=ResultTableMetadata(
91
- title="Missing Values Results for Dataset"
92
- ),
93
- )
94
- ]
95
- )
96
-
97
- def run(self):
98
- df = self.inputs.dataset.df
99
-
100
- # Check if the index of dataframe is datetime
101
- is_datetime = pd.api.types.is_datetime64_any_dtype(df.index)
102
- if not is_datetime:
103
- raise ValueError("Dataset must be provided with datetime index")
104
-
105
- # Validate threshold parameter
106
- if "min_threshold" not in self.params:
107
- raise ValueError("min_threshold must be provided in params")
108
- min_threshold = self.params["min_threshold"]
109
-
110
- rows = df.shape[0]
111
- missing = df.isna().sum()
112
- test_results = [
113
- ThresholdTestResult(
114
- column=col,
115
- passed=missing[col] < min_threshold,
116
- values={"n_missing": missing[col], "p_missing": missing[col] / rows},
117
- )
118
108
  for col in missing.index
119
- ]
120
-
121
- fig_barplot = self._barplot(df)
122
- fig_heatmap = self._heatmap(df)
123
- test_figures = []
124
- if fig_barplot is not None:
125
- test_figures.append(
126
- Figure(
127
- for_object=self,
128
- key=f"{self.name}:barplot",
129
- figure=fig_barplot,
130
- metadata={"type": "barplot"},
131
- )
132
- )
133
- test_figures.append(
134
- Figure(
135
- for_object=self,
136
- key=f"{self.name}:heatmap",
137
- figure=fig_heatmap,
138
- metadata={"type": "heatmap"},
139
- )
140
- )
141
-
142
- return self.cache_results(
143
- test_results,
144
- passed=all([r.passed for r in test_results]),
145
- # Don't pass figures until we figure out how to group metric-figures for multiple
146
- # executions inside a single test run
147
- # figures=test_figures,
148
- )
149
-
150
- def _barplot(self, df):
151
- """
152
- Generate a bar plot of missing values using Plotly.
153
- """
154
- missing_values = df.isnull().sum()
155
- if sum(missing_values.values) != 0:
156
- fig = px.bar(
157
- missing_values,
158
- x=missing_values.index,
159
- y=missing_values.values,
160
- labels={"x": "", "y": "Missing Values"},
161
- title="Total Number of Missing Values per Variable",
162
- color=missing_values.values,
163
- color_continuous_scale="Reds",
164
- )
165
- else:
166
- fig = None
167
-
168
- return fig
169
-
170
- def _heatmap(self, df):
171
- """
172
- Plots a heatmap to visualize missing values using Plotly.
173
- """
174
- # Create a boolean mask for missing values
175
- missing_mask = df.isnull()
176
- z = missing_mask.T.astype(int).values # Convert boolean to int for heatmap
177
-
178
- x = missing_mask.index.tolist()
179
- y = missing_mask.columns.tolist()
180
-
181
- if not x:
182
- fig = ff.create_annotated_heatmap(
183
- z=z, x=x, y=y, colorscale="Reds", showscale=False
184
- )
185
- fig.update_layout(title="Missing Values Heatmap")
186
- else:
187
- fig = None
188
-
189
- return fig
109
+ ],
110
+ barplot,
111
+ heatmap,
112
+ all(missing[col] < min_threshold for col in missing.index),
113
+ )
@@ -2,23 +2,17 @@
2
2
  # See the LICENSE file in the root of this repository for details.
3
3
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
4
 
5
- from dataclasses import dataclass
6
-
7
5
  import pandas as pd
8
6
  import plotly.graph_objects as go
9
7
 
10
- from validmind.vm_models import (
11
- Figure,
12
- ResultSummary,
13
- ResultTable,
14
- ResultTableMetadata,
15
- ThresholdTest,
16
- ThresholdTestResult,
17
- )
8
+ from validmind import tags, tasks
9
+ from validmind.errors import SkipTestError
10
+ from validmind.vm_models import VMDataset
18
11
 
19
12
 
20
- @dataclass
21
- class TimeSeriesOutliers(ThresholdTest):
13
+ @tags("time_series_data")
14
+ @tasks("regression")
15
+ def TimeSeriesOutliers(dataset: VMDataset, zscore_threshold: int = 3):
22
16
  """
23
17
  Identifies and visualizes outliers in time-series data using the z-score method.
24
18
 
@@ -62,174 +56,63 @@ class TimeSeriesOutliers(ThresholdTest):
62
56
  - It does not address possible ways to handle identified outliers in the data.
63
57
  - The requirement for a datetime index could limit its application.
64
58
  """
65
-
66
- name = "time_series_outliers"
67
- required_inputs = ["dataset"]
68
- default_params = {"zscore_threshold": 3}
69
- tasks = ["regression"]
70
- tags = ["time_series_data"]
71
-
72
- def summary(self, results, all_passed: bool):
73
- """
74
- Example output:
75
- [
76
- {
77
- "values": {
78
- "Variable": [...],
79
- "z-score": [...],
80
- "Threshold": [3, 3, 3, 3, 3, 3],
81
- "Date": [...]
82
- },
83
- "test_name": "outliers",
84
- "passed": false
85
- }
86
- ]
87
- """
88
-
89
- first_result = results[0]
90
-
91
- variables = first_result.values["Variable"]
92
- zScores = first_result.values["z-score"]
93
- dates = first_result.values["Date"]
94
- passFail = [
95
- "Pass" if abs(z) < self.params["zscore_threshold"] else "Fail"
96
- for z in zScores
97
- ]
98
-
99
- return ResultSummary(
100
- results=[
101
- ResultTable(
102
- # Sort by variable and then by date
103
- data=pd.DataFrame(
104
- {
105
- "Variable": variables,
106
- "Date": dates,
107
- "z-Score": zScores,
108
- "Pass/Fail": passFail,
109
- }
110
- ).sort_values(["Variable", "Date"]),
111
- metadata=ResultTableMetadata(
112
- title="Outliers Results with z-Score Test"
113
- ),
59
+ df = dataset.df
60
+
61
+ if not pd.api.types.is_datetime64_any_dtype(df.index):
62
+ raise SkipTestError("Dataset must be provided with datetime index")
63
+
64
+ df_numeric = df[dataset.feature_columns_numeric]
65
+ z_scores = pd.DataFrame(
66
+ data=df_numeric.apply(lambda x: (x - x.mean()) / x.std()),
67
+ index=df.index,
68
+ columns=dataset.feature_columns_numeric,
69
+ )
70
+
71
+ outlier_table = []
72
+ outliers = z_scores[(z_scores.abs() > zscore_threshold).any(axis=1)]
73
+
74
+ for idx, row in outliers.iterrows():
75
+ for col in dataset.feature_columns_numeric:
76
+ if abs(row[col]) > zscore_threshold:
77
+ outlier_table.append(
78
+ {
79
+ "Column": col,
80
+ "Z-Score": row[col],
81
+ "Threshold": zscore_threshold,
82
+ "Date": idx.strftime("%Y-%m-%d"),
83
+ "Pass/Fail": "Fail",
84
+ }
114
85
  )
115
- ]
116
- )
117
-
118
- def run(self):
119
- # Initialize the test_results list
120
- test_results = []
121
-
122
- # Check if the index of dataframe is datetime
123
- is_datetime = pd.api.types.is_datetime64_any_dtype(self.inputs.dataset.df.index)
124
- if not is_datetime:
125
- raise ValueError("Dataset must be provided with datetime index")
126
86
 
127
- # Validate threshold parameter
128
- if "zscore_threshold" not in self.params:
129
- raise ValueError("zscore_threshold must be provided in params")
130
- zscore_threshold = self.params["zscore_threshold"]
87
+ outlier_df = pd.DataFrame(outlier_table)
88
+ figures = []
131
89
 
132
- temp_df = self.inputs.dataset.df.copy()
133
- # temp_df = temp_df.dropna()
90
+ for column in outlier_df["Column"].unique():
91
+ fig = go.Figure()
134
92
 
135
- # Infer numeric columns
136
- num_features_columns = temp_df.select_dtypes(
137
- include=["number"]
138
- ).columns.tolist()
139
-
140
- outliers_table = self.identify_outliers(
141
- temp_df[num_features_columns], zscore_threshold
93
+ fig.add_trace(
94
+ go.Scatter(x=df.index, y=df[column], mode="lines", name="Time Series")
142
95
  )
143
96
 
144
- test_figures = self._plot_outliers(temp_df, outliers_table)
145
- passed = outliers_table.empty
146
-
147
- if not outliers_table.empty:
148
- outliers_table["Date"] = outliers_table["Date"].astype(str)
149
-
150
- test_results.append(
151
- ThresholdTestResult(
152
- test_name="outliers",
153
- passed=passed,
154
- values=outliers_table.to_dict(orient="list"),
97
+ column_outliers = outlier_df[outlier_df["Column"] == column]
98
+ fig.add_trace(
99
+ go.Scatter(
100
+ x=pd.to_datetime(column_outliers["Date"]),
101
+ y=df.loc[pd.to_datetime(column_outliers["Date"]), column],
102
+ mode="markers",
103
+ marker=dict(color="red", size=10),
104
+ name="Outliers",
155
105
  )
156
106
  )
157
107
 
158
- return self.cache_results(test_results, passed=passed, figures=test_figures)
159
-
160
- def z_score_with_na(self, df):
161
- return df.apply(
162
- lambda x: (x - x.mean()) / x.std() if x.dtype.kind in "biufc" else x
108
+ fig.update_layout(
109
+ title=f"Outliers for {column}", xaxis_title="Date", yaxis_title=column
163
110
  )
164
111
 
165
- def identify_outliers(self, df, threshold):
166
- """
167
- Identifies and returns outliers in a pandas DataFrame using the z-score method.
168
- Args:
169
- df (pandas.DataFrame): A pandas DataFrame containing the data to be analyzed.
170
- threshold (float): The absolute value of the z-score above which a value is considered an outlier.
171
- Returns:
172
- pandas.DataFrame: A DataFrame containing the variables, z-scores, threshold, and dates of the identified outliers.
173
- """
174
- z_scores = pd.DataFrame(
175
- self.z_score_with_na(df), index=df.index, columns=df.columns
176
- )
177
-
178
- outliers = z_scores[(z_scores.abs() > threshold).any(axis=1)]
179
- outlier_table = []
180
- for idx, row in outliers.iterrows():
181
- for col in df.columns:
182
- if abs(row[col]) > threshold:
183
- outlier_table.append(
184
- {
185
- "Variable": col,
186
- "z-score": row[col],
187
- "Threshold": threshold,
188
- "Date": idx,
189
- }
190
- )
191
- return pd.DataFrame(outlier_table)
192
-
193
- def _plot_outliers(self, df, outliers_table):
194
- """
195
- Plots time series with identified outliers.
196
- Args:
197
- df (pandas.DataFrame): Input data with time series.
198
- outliers_table (pandas.DataFrame): DataFrame with identified outliers.
199
- Returns:
200
- list: A list of Figure objects with subplots for each variable.
201
- """
202
- figures = []
203
-
204
- for col in df.columns:
205
- fig = go.Figure()
206
-
207
- fig.add_trace(go.Scatter(x=df.index, y=df[col], mode="lines", name=col))
208
-
209
- if not outliers_table.empty:
210
- variable_outliers = outliers_table[outliers_table["Variable"] == col]
211
- fig.add_trace(
212
- go.Scatter(
213
- x=variable_outliers["Date"],
214
- y=df.loc[variable_outliers["Date"], col],
215
- mode="markers",
216
- marker=dict(color="red", size=10),
217
- name="Outlier",
218
- )
219
- )
220
-
221
- fig.update_layout(
222
- title=f"Outliers for {col}",
223
- xaxis_title="Date",
224
- yaxis_title=col,
225
- )
226
-
227
- figures.append(
228
- Figure(
229
- for_object=self,
230
- key=f"{self.name}:{col}_{self.inputs.dataset.input_id}",
231
- figure=fig,
232
- )
233
- )
112
+ figures.append(fig)
234
113
 
235
- return figures
114
+ return (
115
+ outlier_df.sort_values(["Column", "Date"]),
116
+ figures,
117
+ len(outlier_df) == 0,
118
+ )