validmind 2.2.6__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. validmind/__version__.py +1 -1
  2. validmind/{ai.py → ai/test_descriptions.py} +74 -82
  3. validmind/ai/utils.py +104 -0
  4. validmind/api_client.py +58 -19
  5. validmind/client.py +5 -5
  6. validmind/models/foundation.py +10 -6
  7. validmind/models/function.py +3 -1
  8. validmind/models/metadata.py +1 -1
  9. validmind/test_suites/__init__.py +1 -7
  10. validmind/test_suites/regression.py +0 -16
  11. validmind/test_suites/statsmodels_timeseries.py +1 -1
  12. validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
  13. validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
  14. validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
  15. validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
  16. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
  17. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
  18. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  19. validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
  20. validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
  21. validmind/tests/data_validation/ScatterPlot.py +1 -1
  22. validmind/tests/data_validation/SeasonalDecompose.py +12 -7
  23. validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
  24. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  25. validmind/tests/data_validation/WOEBinTable.py +1 -1
  26. validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
  27. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  28. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  29. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  30. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
  31. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  32. validmind/tests/data_validation/nlp/Sentiment.py +1 -1
  33. validmind/tests/data_validation/nlp/TextDescription.py +5 -1
  34. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  35. validmind/tests/decorator.py +1 -1
  36. validmind/tests/model_validation/FeaturesAUC.py +5 -3
  37. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
  38. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
  39. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
  40. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
  41. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
  42. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
  43. validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
  44. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  45. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  46. validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
  47. validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
  48. validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
  49. validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
  50. validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
  51. validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
  52. validmind/tests/model_validation/ragas/utils.py +35 -9
  53. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  54. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
  55. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
  56. validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
  57. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
  58. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  59. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
  60. validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
  61. validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
  62. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
  63. validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
  64. validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
  65. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
  66. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
  67. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
  68. validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
  69. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
  70. validmind/tests/prompt_validation/Bias.py +14 -11
  71. validmind/tests/prompt_validation/Clarity.py +14 -11
  72. validmind/tests/prompt_validation/Conciseness.py +14 -11
  73. validmind/tests/prompt_validation/Delimitation.py +14 -11
  74. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  75. validmind/tests/prompt_validation/Robustness.py +11 -11
  76. validmind/tests/prompt_validation/Specificity.py +14 -11
  77. validmind/tests/prompt_validation/ai_powered_test.py +53 -75
  78. validmind/unit_metrics/composite.py +2 -1
  79. validmind/utils.py +4 -63
  80. validmind/vm_models/dataset/dataset.py +17 -3
  81. validmind/vm_models/dataset/utils.py +2 -2
  82. validmind/vm_models/model.py +1 -1
  83. validmind/vm_models/test/metric.py +1 -8
  84. validmind/vm_models/test/result_wrapper.py +2 -2
  85. validmind/vm_models/test/test.py +3 -0
  86. validmind/vm_models/test/threshold_test.py +1 -1
  87. validmind/vm_models/test_suite/runner.py +7 -4
  88. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
  89. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/RECORD +92 -101
  90. validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
  91. validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
  92. validmind/tests/data_validation/PiTPDHistogram.py +0 -152
  93. validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
  94. validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
  95. validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
  96. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
  97. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
  98. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
  99. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
  100. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
  101. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
  102. {validmind-2.2.6.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/ACFandPACFPlot.py

@@ -2,9 +2,9 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import matplotlib.pyplot as plt
 import pandas as pd
-from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
+import plotly.graph_objects as go
+from statsmodels.tsa.stattools import acf, pacf
 
 from validmind.vm_models import Figure, Metric
 
@@ -77,37 +77,46 @@ class ACFandPACFPlot(Metric):
         for col in df.columns:
             series = df[col]
 
-            # Create subplots
-            fig, (ax1, ax2) = plt.subplots(1, 2)
-            width, _ = fig.get_size_inches()
-            fig.set_size_inches(width, 5)
-
-            plot_acf(series, ax=ax1)
-            plot_pacf(series, ax=ax2)
-
-            # Get the current y-axis limits
-            ymin, ymax = ax1.get_ylim()
-            # Set new limits - adding a bit of space
-            ax1.set_ylim([ymin, ymax + 0.05 * (ymax - ymin)])
+            # Calculate the maximum number of lags based on the size of the dataset
+            max_lags = min(40, len(series) // 2 - 1)
+
+            # Calculate ACF and PACF values
+            acf_values = acf(series, nlags=max_lags)
+            pacf_values = pacf(series, nlags=max_lags)
+
+            # Create ACF plot using Plotly
+            acf_fig = go.Figure()
+            acf_fig.add_trace(go.Bar(x=list(range(len(acf_values))), y=acf_values))
+            acf_fig.update_layout(
+                title=f"ACF for {col}",
+                xaxis_title="Lag",
+                yaxis_title="ACF",
+                font=dict(size=18),
+            )
 
-            ymin, ymax = ax2.get_ylim()
-            ax2.set_ylim([ymin, ymax + 0.05 * (ymax - ymin)])
+            # Create PACF plot using Plotly
+            pacf_fig = go.Figure()
+            pacf_fig.add_trace(go.Bar(x=list(range(len(pacf_values))), y=pacf_values))
+            pacf_fig.update_layout(
+                title=f"PACF for {col}",
+                xaxis_title="Lag",
+                yaxis_title="PACF",
+                font=dict(size=18),
+            )
 
-            ax1.tick_params(axis="both", labelsize=18)
-            ax2.tick_params(axis="both", labelsize=18)
-            ax1.set_title(f"ACF for {col}", weight="bold", fontsize=20)
-            ax2.set_title(f"PACF for {col}", weight="bold", fontsize=20)
-            ax1.set_xlabel("Lag", fontsize=18)
-            ax2.set_xlabel("Lag", fontsize=18)
             figures.append(
                 Figure(
                     for_object=self,
-                    key=f"{self.key}:{col}",
-                    figure=fig,
+                    key=f"{self.key}:{col}_acf",
+                    figure=acf_fig,
+                )
+            )
+            figures.append(
+                Figure(
+                    for_object=self,
+                    key=f"{self.key}:{col}_pacf",
+                    figure=pacf_fig,
                 )
             )
-
-            # Do this if you want to prevent the figure from being displayed
-            plt.close("all")
 
         return self.cache_results(figures=figures)
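For context, a minimal standalone sketch of the new Plotly-based approach, using a synthetic AR(1) series (data and variable names here are illustrative, not from the package). The min(40, len(series) // 2 - 1) cap keeps nlags below the half-sample limit that statsmodels' pacf enforces:

import numpy as np
import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf, pacf

# Synthetic AR(1) series for illustration only
rng = np.random.default_rng(0)
series = np.zeros(200)
for t in range(1, 200):
    series[t] = 0.7 * series[t - 1] + rng.normal()

max_lags = min(40, len(series) // 2 - 1)  # same lag cap as the updated test
acf_values = acf(series, nlags=max_lags)
pacf_values = pacf(series, nlags=max_lags)

# One bar chart per statistic, mirroring the new per-column figures
fig = go.Figure(go.Bar(x=list(range(len(acf_values))), y=acf_values))
fig.update_layout(title="ACF", xaxis_title="Lag", yaxis_title="ACF")
# fig.show()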
validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py

@@ -2,12 +2,18 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from pandas import DataFrame
+from dataclasses import dataclass
+
+import pandas as pd
 from statsmodels.tsa.stattools import adfuller
 
+from validmind.logging import get_logger
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
+logger = get_logger(__name__)
+
 
+@dataclass
 class ADF(Metric):
     """
     Assesses the stationarity of a time series dataset using the Augmented Dickey-Fuller (ADF) test.
@@ -53,7 +59,7 @@ class ADF(Metric):
         }
 
     def summary(self, metric_value: dict):
-        table = DataFrame.from_dict(metric_value, orient="index")
+        table = pd.DataFrame.from_dict(metric_value, orient="index")
         table = table.reset_index()
         table.columns = [
             "Feature",
@@ -83,18 +89,41 @@ class ADF(Metric):
         """
         dataset = self.inputs.dataset.df
 
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
         adf_values = {}
         for col in dataset.columns:
-            adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(
-                dataset[col].values
-            )
-            adf_values[col] = {
-                "stat": adf,
-                "pvalue": pvalue,
-                "usedlag": usedlag,
-                "nobs": nobs,
-                "critical_values": critical_values,
-                "icbest": icbest,
-            }
+            try:
+                adf_result = adfuller(dataset[col].values)
+                adf_values[col] = {
+                    "ADF Statistic": adf_result[0],
+                    "P-Value": adf_result[1],
+                    "Used Lag": adf_result[2],
+                    "Number of Observations": adf_result[3],
+                    "Critical Values": adf_result[4],
+                    "IC Best": adf_result[5],
+                }
+            except Exception as e:
+                logger.error(f"Error processing column '{col}': {e}")
+                adf_values[col] = {
+                    "ADF Statistic": None,
+                    "P-Value": None,
+                    "Used Lag": None,
+                    "Number of Observations": None,
+                    "Critical Values": None,
+                    "IC Best": None,
+                    "Error": str(e),
+                }
 
         return self.cache_results(adf_values)
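A hedged sketch of the updated ADF flow on a toy DataFrame; the index check and NaN drop mirror the new guards above, while the data and column name are invented:

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller

# Random walk on a daily datetime index (illustrative data)
idx = pd.date_range("2020-01-01", periods=120, freq="D")
df = pd.DataFrame({"y": np.random.default_rng(1).normal(size=120).cumsum()}, index=idx)

if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
    raise ValueError("expected a datetime or period index")
df = df.dropna()

# adfuller returns (stat, pvalue, usedlag, nobs, critical values, icbest)
stat, pvalue, usedlag, nobs, critical_values, icbest = adfuller(df["y"].values)
print(f"ADF statistic={stat:.3f}, p-value={pvalue:.3f}")  # high p: unit root not rejected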
validmind/tests/data_validation/BivariateScatterPlots.py

@@ -2,10 +2,10 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+import itertools
 from dataclasses import dataclass
 
-import matplotlib.pyplot as plt
-import seaborn as sns
+import plotly.express as px
 
 from validmind.vm_models import Figure, Metric
 
@@ -23,7 +23,7 @@ class BivariateScatterPlots(Metric):
     biases and irregularities in the data.
 
     **Test Mechanism**: This metric operates by creating a scatter plot for each pair of the selected features in the
-    dataset. If the parameters "features_pairs" are not specified, an error will be thrown. The metric offers
+    dataset. If the parameters "selected_columns" are not specified, an error will be thrown. The metric offers
     flexibility by allowing the user to filter on a specific target class - specified by the "target_filter" parameter
     - for more granified insights. Each scatterplot is then color-coded based on the category of the target variable
     for better visual differentiation. The seaborn scatterplot library is used for generating the plots.
@@ -53,7 +53,7 @@ class BivariateScatterPlots(Metric):
 
     name = "bivariate_scatter_plots"
    required_inputs = ["dataset"]
-    default_params = {"features_pairs": None, "target_filter": None}
+    default_params = {"selected_columns": None}
     metadata = {
         "task_types": ["classification"],
         "tags": [
@@ -65,52 +65,49 @@ class BivariateScatterPlots(Metric):
         ],
     }
 
-    def plot_bivariate_scatter(self, features_pairs, target_filter):
-        status_var = self.inputs.dataset.target_column
+    def plot_bivariate_scatter(self, columns):
         figures = []
-        for x, y in features_pairs.items():
-            df = self.inputs.dataset.df
-            if target_filter is not None:
-                df = df[df[status_var] == target_filter]
-
-            plt.figure()
-
-            # Scatterplot using seaborn, with color variation based on 'status_var'
-            # Create color mapping with rgba values, last value is alpha (transparency)
-            palette = {0: (0.8, 0.8, 0.8, 0.8), 1: "tab:red"}
-            plot = sns.scatterplot(
-                data=df, x=x, y=y, hue=status_var, palette=palette, alpha=1
+        df = self.inputs.dataset.df
+
+        # Generate all pairs of columns
+        features_pairs = list(itertools.combinations(columns, 2))
+
+        for x, y in features_pairs:
+            fig = px.scatter(
+                df,
+                x=x,
+                y=y,
+                title=f"{x} and {y}",
+                labels={x: x, y: y},
+                opacity=0.7,
+                color_discrete_sequence=["blue"],  # Use the same color for all points
             )
-
-            # Change legend labels
-            legend_labels = [
-                "Category 1" if t.get_text() == "1" else "Category 2"
-                for t in plot.legend_.texts[1:]
-            ]
-            plot.legend_.texts[1:] = legend_labels
-
-            plt.title(x + " and " + y)
-            plt.xlabel(x)
-            plt.ylabel(y)
-            plt.show()
+            fig.update_traces(marker=dict(color="blue"))
 
             figures.append(
-                Figure(for_object=self, key=f"{self.key}:{x}_{y}", figure=plt.figure())
+                Figure(for_object=self, key=f"{self.key}:{x}_{y}", figure=fig)
            )
 
-        plt.close("all")
-
         return figures
 
     def run(self):
-        features_pairs = self.params["features_pairs"]
-        target_filter = self.params["target_filter"]
-
-        if features_pairs is None:
-            raise ValueError(
-                "The features_pairs parameter is required for this metric."
-            )
+        selected_columns = self.params["selected_columns"]
+
+        if selected_columns is None:
+            # Use all columns if selected_columns is not provided
+            selected_columns = self.inputs.dataset.df.columns.tolist()
+        else:
+            # Check if all selected columns exist in the dataframe
+            missing_columns = [
+                col
+                for col in selected_columns
+                if col not in self.inputs.dataset.df.columns
+            ]
+            if missing_columns:
+                raise ValueError(
+                    f"The following selected columns are not in the dataframe: {missing_columns}"
                )
 
-        figures = self.plot_bivariate_scatter(features_pairs, target_filter)
+        figures = self.plot_bivariate_scatter(selected_columns)
 
         return self.cache_results(figures=figures)
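The core of the new mechanism is just itertools.combinations plus plotly.express; a minimal sketch with a made-up DataFrame:

import itertools

import pandas as pd
import plotly.express as px

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

# combinations yields ("a","b"), ("a","c"), ("b","c") -- one plot per pair
for x, y in itertools.combinations(df.columns, 2):
    fig = px.scatter(df, x=x, y=y, title=f"{x} and {y}", opacity=0.7)
    # fig.show()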
validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py

@@ -4,9 +4,14 @@
 
 from dataclasses import dataclass
 
+import pandas as pd
 from arch.unitroot import DFGLS
+from numpy.linalg import LinAlgError
 
-from validmind.vm_models import Metric
+from validmind.logging import get_logger
+from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+
+logger = get_logger(__name__)
 
 
 @dataclass
@@ -59,14 +64,65 @@ class DFGLSArch(Metric):
         """
         dataset = self.inputs.dataset.df
 
-        dfgls_values = {}
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
+        # Convert to numeric and handle non-numeric data
+        dataset = dataset.apply(pd.to_numeric, errors="coerce")
+
+        # Initialize a list to store DFGLS results
+        dfgls_values = []
+
         for col in dataset.columns:
-            dfgls_out = DFGLS(dataset[col].values)
-            dfgls_values[col] = {
-                "stat": dfgls_out.stat,
-                "pvalue": dfgls_out.pvalue,
-                "usedlag": dfgls_out.lags,
-                "nobs": dfgls_out.nobs,
-            }
-
-        return self.cache_results(dfgls_values)
+            try:
+                dfgls_out = DFGLS(dataset[col].values)
+                dfgls_values.append(
+                    {
+                        "Variable": col,
+                        "stat": dfgls_out.stat,
+                        "pvalue": dfgls_out.pvalue,
+                        "usedlag": dfgls_out.lags,
+                        "nobs": dfgls_out.nobs,
+                    }
+                )
+            except LinAlgError as e:
+                logger.error(
+                    f"SVD did not converge while processing column '{col}'. This could be due to numerical instability or multicollinearity. Error details: {e}"
+                )
+                dfgls_values.append(
+                    {
+                        "Variable": col,
+                        "stat": None,
+                        "pvalue": None,
+                        "usedlag": None,
+                        "nobs": None,
+                        "error": str(e),
+                    }
+                )
+
+        return self.cache_results({"dfgls_results": dfgls_values})
+
+    def summary(self, metric_value):
+        """
+        Build a table for summarizing the DFGLS results
+        """
+        dfgls_results = metric_value["dfgls_results"]
+
+        return ResultSummary(
+            results=[
+                ResultTable(
+                    data=dfgls_results,
+                    metadata=ResultTableMetadata(title="DFGLS Test Results"),
+                )
+            ]
+        )
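A sketch of the guarded loop pattern on synthetic data; the row of Nones mirrors the new LinAlgError fallback (series values are invented):

import numpy as np
import pandas as pd
from arch.unitroot import DFGLS
from numpy.linalg import LinAlgError

idx = pd.date_range("2021-01-01", periods=100, freq="D")
df = pd.DataFrame({"x": np.random.default_rng(2).normal(size=100).cumsum()}, index=idx)

rows = []
for col in df.columns:
    try:
        res = DFGLS(df[col].values)
        rows.append({"Variable": col, "stat": res.stat, "pvalue": res.pvalue})
    except LinAlgError as e:  # e.g. SVD failure on degenerate input
        rows.append({"Variable": col, "stat": None, "pvalue": None, "error": str(e)})

print(pd.DataFrame(rows))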
validmind/tests/data_validation/HeatmapFeatureCorrelations.py

@@ -62,7 +62,7 @@ class HeatmapFeatureCorrelations(Metric):
     }
 
     def run(self):
-        features = self.params["features"]
+        features = self.params.get("features")
         declutter = self.params.get("declutter", False)
         fontsize = self.params.get("fontsize", 13)
 
validmind/tests/data_validation/HighPearsonCorrelation.py

@@ -65,9 +65,18 @@ class HighPearsonCorrelation(ThresholdTest):
     }
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The high pearson correlation test returns results like these:
-        [{"values": {"correlations": [{"column": "NumOfProducts", "correlation": -0.3044645622389459}]}, "column": "Balance", "passed": false}]
+        """The high pearson correlation test returns results like these:
+        [
+            {
+                "values": {
+                    "correlations": [
+                        {"column": "NumOfProducts", "correlation": -0.3044645622389459}
+                    ]
+                },
+                "column": "Balance",
+                "passed": false,
+            }
+        ]
         """
         results_table = [
             {
validmind/tests/data_validation/IsolationForestOutliers.py

@@ -64,7 +64,7 @@ class IsolationForestOutliers(Metric):
 
     def run(self):
         if self.params["features_columns"] is None:
-            features_list = self.inputs.dataset.feature_columns
+            features_list = self.inputs.dataset.feature_columns_numeric
         else:
             features_list = self.params["features_columns"]
 
@@ -78,7 +78,7 @@ class IsolationForestOutliers(Metric):
                 + "training dataset feature columns"
             )
 
-        dataset = self.inputs.dataset.df
+        dataset = self.inputs.dataset.df[features_list]
 
         # Training with isolation forest algorithm
         clf = IsolationForest(
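The effect of the two changed lines, in isolation: the model now sees only the numeric feature subset rather than the full DataFrame. A toy sketch (column names invented; select_dtypes stands in for the dataset's feature_columns_numeric property):

import pandas as pd
from sklearn.ensemble import IsolationForest

df = pd.DataFrame({
    "amount": [10.0, 12.0, 11.5, 500.0],  # numeric feature, kept
    "region": ["a", "b", "a", "c"],       # non-numeric, now excluded
})

numeric_cols = df.select_dtypes(include="number").columns.tolist()
clf = IsolationForest(random_state=0).fit(df[numeric_cols])
print(clf.predict(df[numeric_cols]))  # -1 marks likely outliers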
validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py

@@ -4,9 +4,13 @@
 
 from dataclasses import dataclass
 
+import pandas as pd
 from statsmodels.tsa.stattools import kpss
 
-from validmind.vm_models import Metric
+from validmind.logging import get_logger
+from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+
+logger = get_logger(__name__)
 
 
 @dataclass
@@ -64,14 +68,63 @@ class KPSS(Metric):
         """
         dataset = self.inputs.dataset.df
 
-        kpss_values = {}
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
+        # Convert to numeric and handle non-numeric data
+        dataset = dataset.apply(pd.to_numeric, errors="coerce")
+
+        # Initialize a list to store KPSS results
+        kpss_values = []
+
         for col in dataset.columns:
-            kpss_stat, pvalue, usedlag, critical_values = kpss(dataset[col].values)
-            kpss_values[col] = {
-                "stat": kpss_stat,
-                "pvalue": pvalue,
-                "usedlag": usedlag,
-                "critical_values": critical_values,
-            }
-
-        return self.cache_results(kpss_values)
+            try:
+                kpss_stat, pvalue, usedlag, critical_values = kpss(dataset[col].values)
+                kpss_values.append(
+                    {
+                        "Variable": col,
+                        "stat": kpss_stat,
+                        "pvalue": pvalue,
+                        "usedlag": usedlag,
+                        "critical_values": critical_values,
+                    }
+                )
+            except Exception as e:
+                logger.error(f"Error processing column '{col}': {e}")
+                kpss_values.append(
+                    {
+                        "Variable": col,
+                        "stat": None,
+                        "pvalue": None,
+                        "usedlag": None,
+                        "critical_values": None,
+                        "error": str(e),
+                    }
+                )
+
+        return self.cache_results({"kpss_results": kpss_values})
+
+    def summary(self, metric_value):
+        """
+        Build a table for summarizing the KPSS results
+        """
+        kpss_results = metric_value["kpss_results"]
+
+        return ResultSummary(
+            results=[
+                ResultTable(
+                    data=kpss_results,
+                    metadata=ResultTableMetadata(title="KPSS Test Results"),
+                )
+            ]
+        )
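Worth remembering when reading these tables: KPSS inverts the null hypothesis relative to ADF (KPSS null = stationary, ADF null = unit root), so the two tests complement each other. A quick sketch on synthetic white noise:

import numpy as np
from statsmodels.tsa.stattools import adfuller, kpss

series = np.random.default_rng(3).normal(size=200)  # stationary by construction

kpss_stat, kpss_p, usedlag, crit = kpss(series)
adf_p = adfuller(series)[1]
# Expect a high KPSS p-value (fail to reject stationarity)
# and a low ADF p-value (reject the unit root)
print(f"KPSS p={kpss_p:.3f}, ADF p={adf_p:.3f}")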
validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py

@@ -4,9 +4,14 @@
 
 from dataclasses import dataclass
 
+import pandas as pd
 from arch.unitroot import PhillipsPerron
+from numpy.linalg import LinAlgError
 
-from validmind.vm_models import Metric
+from validmind.logging import get_logger
+from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+
+logger = get_logger(__name__)
 
 
 @dataclass
@@ -62,14 +67,63 @@ class PhillipsPerronArch(Metric):
         """
         dataset = self.inputs.dataset.df
 
-        pp_values = {}
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
+        # Convert to numeric and handle non-numeric data
+        dataset = dataset.apply(pd.to_numeric, errors="coerce")
+
+        # Initialize a list to store Phillips-Perron results
+        pp_values = []
+
         for col in dataset.columns:
-            pp = PhillipsPerron(dataset[col].values)
-            pp_values[col] = {
-                "stat": pp.stat,
-                "pvalue": pp.pvalue,
-                "usedlag": pp.lags,
-                "nobs": pp.nobs,
-            }
-
-        return self.cache_results(pp_values)
+            try:
+                pp = PhillipsPerron(dataset[col].values)
+                pp_values.append(
+                    {
+                        "Variable": col,
+                        "stat": pp.stat,
+                        "pvalue": pp.pvalue,
+                        "usedlag": pp.lags,
+                        "nobs": pp.nobs,
+                    }
+                )
+            except LinAlgError as e:
+                logger.error(f"Error processing column '{col}': {e}")
+                pp_values.append(
+                    {
+                        "Variable": col,
+                        "stat": None,
+                        "pvalue": None,
+                        "usedlag": None,
+                        "nobs": None,
+                        "error": str(e),
+                    }
+                )
+
+        return self.cache_results({"phillips_perron_results": pp_values})
+
+    def summary(self, metric_value):
+        """
+        Build a table for summarizing the Phillips-Perron results
+        """
+        pp_results = metric_value["phillips_perron_results"]
+
+        return ResultSummary(
+            results=[
+                ResultTable(
+                    data=pp_results,
+                    metadata=ResultTableMetadata(title="Phillips-Perron Test Results"),
+                )
+            ]
+        )
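The summary() pattern shared by DFGLSArch, KPSS, and PhillipsPerronArch is the same: a list of row dicts wrapped in a single ResultTable. A sketch with placeholder values, assuming only the constructor arguments visible in this diff:

from validmind.vm_models import ResultSummary, ResultTable, ResultTableMetadata

# One dict per variable, as produced by the loops above (values are placeholders)
pp_rows = [{"Variable": "y", "stat": -3.2, "pvalue": 0.02, "usedlag": 4, "nobs": 96}]

summary = ResultSummary(
    results=[
        ResultTable(
            data=pp_rows,
            metadata=ResultTableMetadata(title="Phillips-Perron Test Results"),
        )
    ]
)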
validmind/tests/data_validation/ScatterPlot.py

@@ -51,7 +51,7 @@ class ScatterPlot(Metric):
     """
 
     name = "scatter_plot"
-    required_inputs = ["dataset", "dataset.target_column"]
+    required_inputs = ["dataset"]
     metadata = {
         "task_types": ["classification", "regression"],
         "tags": ["tabular_data", "visualization"],
validmind/tests/data_validation/SeasonalDecompose.py

@@ -90,14 +90,18 @@ class SeasonalDecompose(Metric):
         dfs = [
             pd.DataFrame(series)
             .pipe(
-                lambda x: x.reset_index()
-                if not isinstance(x.index, pd.DatetimeIndex)
-                else x.reset_index().rename(columns={x.index.name: "Date"})
+                lambda x: (
+                    x.reset_index()
+                    if not isinstance(x.index, pd.DatetimeIndex)
+                    else x.reset_index().rename(columns={x.index.name: "Date"})
+                )
             )
             .assign(
-                Date=lambda x: x["Date"].astype(str)
-                if "Date" in x.columns
-                else x.index.astype(str)
+                Date=lambda x: (
+                    x["Date"].astype(str)
+                    if "Date" in x.columns
+                    else x.index.astype(str)
+                )
             )
             for series in results.values()
         ]
@@ -200,7 +204,8 @@ class SeasonalDecompose(Metric):
             )
         else:
             warnings.warn(
-                f"No frequency could be inferred for variable '{col}'. Skipping seasonal decomposition and plots for this variable."
+                f"No frequency could be inferred for variable '{col}'. "
+                "Skipping seasonal decomposition and plots for this variable."
             )
 
         return self.cache_results(results, figures=figures)
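The warning above implies a frequency-inference guard; a sketch of that pattern using pd.infer_freq (whether the test infers frequency exactly this way is an assumption, and the weekly series is invented):

import warnings

import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

idx = pd.date_range("2020-01-01", periods=365, freq="D")
series = pd.Series(np.sin(np.arange(365) * 2 * np.pi / 7), index=idx)  # weekly cycle

freq = pd.infer_freq(series.index)
if freq is None:
    warnings.warn("No frequency could be inferred. Skipping seasonal decomposition.")
else:
    result = seasonal_decompose(series, period=7)  # period assumed, not inferred
    print(result.seasonal.head())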