validmind 2.2.5__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. validmind/__version__.py +1 -1
  2. validmind/{ai.py → ai/test_descriptions.py} +127 -69
  3. validmind/ai/utils.py +104 -0
  4. validmind/api_client.py +70 -31
  5. validmind/client.py +5 -5
  6. validmind/logging.py +38 -32
  7. validmind/models/foundation.py +10 -6
  8. validmind/models/function.py +3 -1
  9. validmind/models/metadata.py +1 -1
  10. validmind/test_suites/__init__.py +1 -7
  11. validmind/test_suites/regression.py +0 -16
  12. validmind/test_suites/statsmodels_timeseries.py +1 -1
  13. validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
  14. validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
  15. validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
  16. validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
  17. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
  18. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
  19. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  20. validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
  21. validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
  22. validmind/tests/data_validation/ScatterPlot.py +1 -1
  23. validmind/tests/data_validation/SeasonalDecompose.py +12 -7
  24. validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
  25. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  26. validmind/tests/data_validation/WOEBinTable.py +1 -1
  27. validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
  28. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  29. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  30. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  31. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
  32. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  33. validmind/tests/data_validation/nlp/Sentiment.py +1 -1
  34. validmind/tests/data_validation/nlp/TextDescription.py +5 -1
  35. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  36. validmind/tests/decorator.py +1 -1
  37. validmind/tests/model_validation/FeaturesAUC.py +5 -3
  38. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
  39. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
  40. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
  41. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
  42. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
  43. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
  44. validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
  45. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  46. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  47. validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
  48. validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
  49. validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
  50. validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
  51. validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
  52. validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
  53. validmind/tests/model_validation/ragas/utils.py +35 -9
  54. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  55. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
  56. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
  57. validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
  58. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
  59. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  60. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
  61. validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
  62. validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
  63. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
  64. validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
  65. validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
  66. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
  67. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
  68. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
  69. validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
  70. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
  71. validmind/tests/prompt_validation/Bias.py +14 -11
  72. validmind/tests/prompt_validation/Clarity.py +14 -11
  73. validmind/tests/prompt_validation/Conciseness.py +14 -11
  74. validmind/tests/prompt_validation/Delimitation.py +14 -11
  75. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  76. validmind/tests/prompt_validation/Robustness.py +11 -11
  77. validmind/tests/prompt_validation/Specificity.py +14 -11
  78. validmind/tests/prompt_validation/ai_powered_test.py +53 -75
  79. validmind/unit_metrics/composite.py +2 -1
  80. validmind/utils.py +4 -49
  81. validmind/vm_models/dataset/dataset.py +17 -3
  82. validmind/vm_models/dataset/utils.py +2 -2
  83. validmind/vm_models/model.py +1 -1
  84. validmind/vm_models/test/metric.py +1 -8
  85. validmind/vm_models/test/result_wrapper.py +27 -34
  86. validmind/vm_models/test/test.py +3 -0
  87. validmind/vm_models/test/threshold_test.py +1 -1
  88. validmind/vm_models/test_suite/runner.py +12 -6
  89. validmind/vm_models/test_suite/summary.py +18 -7
  90. validmind/vm_models/test_suite/test.py +13 -20
  91. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
  92. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/RECORD +95 -104
  93. validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
  94. validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
  95. validmind/tests/data_validation/PiTPDHistogram.py +0 -152
  96. validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
  97. validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
  98. validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
  99. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
  100. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
  101. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
  102. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
  103. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
  104. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
  105. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
validmind/logging.py CHANGED
@@ -13,22 +13,45 @@ from sentry_sdk.utils import event_from_exception, exc_info_from_error
 
 from .__version__ import __version__
 
-__log_level = None
 __dsn = "https://48f446843657444aa1e2c0d716ef864b@o1241367.ingest.sentry.io/4505239625465856"
 
 
 def _get_log_level():
-    """Get the log level from the environment variable if not already set"""
-    if __log_level is not None:
-        return __log_level
+    """Get the log level from the environment variable"""
+    log_level_str = os.getenv("LOG_LEVEL", "INFO").upper()
 
-    log_level_str = os.environ.get("LOG_LEVEL", "INFO").upper()
     if log_level_str not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
         raise ValueError(f"Invalid log level: {log_level_str}")
 
     return logging.getLevelName(log_level_str)
 
 
+def get_logger(name="validmind", log_level=None):
+    """Get a logger for the given module name"""
+    formatter = logging.Formatter(
+        fmt="%(asctime)s - %(levelname)s(%(name)s): %(message)s"
+    )
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+
+    logger = logging.getLogger(name)
+    logger.setLevel(log_level or _get_log_level())
+
+    # Clear existing handlers if any (or refine the existing logic as necessary)
+    # TODO: move this to a yaml config and only configure once
+    if not any(
+        isinstance(h, type(handler)) and h.formatter._fmt == formatter._fmt
+        for h in logger.handlers
+    ):
+        logger.addHandler(handler)
+
+    # Prevent logger from propagating to root logger
+    logger.propagate = False
+
+    return logger
+
+
 def init_sentry(server_config):
     """Initialize Sentry SDK for sending logs back to ValidMind
 
@@ -42,7 +65,10 @@ def init_sentry(server_config):
         - dsn (str): The Sentry DSN
         ...: Other config options for Sentry
     """
-    if server_config.get("send_logs", False) is False:
+    if os.getenv("VM_NO_TELEMETRY", False):
+        return
+
+    if not server_config.get("send_logs", False):
         return
 
     config = {
@@ -53,33 +79,13 @@
         "environment": "production",
     }
     config.update({k: v for k, v in server_config.items() if k != "send_logs"})
-    sentry_sdk.init(**config)
-
-
-def get_logger(name="validmind", log_level=None):
-    """Get a logger for the given name"""
-    formatter = logging.Formatter(
-        fmt="%(asctime)s - %(levelname)s(%(name)s): %(message)s"
-    )
-
-    handler = logging.StreamHandler()
-    handler.setFormatter(formatter)
-
-    logger = logging.getLogger(name)
-    logger.setLevel(log_level or _get_log_level())
-
-    # Clear existing handlers if any (or refine the existing logic as necessary)
-    # TODO: lets add some better handler management
-    if not any(
-        isinstance(h, type(handler)) and h.formatter._fmt == formatter._fmt
-        for h in logger.handlers
-    ):
-        logger.addHandler(handler)
-
-    # Prevent logger from propagating to root logger
-    logger.propagate = False
 
-    return logger
+    try:
+        sentry_sdk.init(**config)
+    except Exception as e:
+        logger = get_logger(__name__)
+        logger.info("Sentry failed to initialize - ignoring...")
+        logger.debug(f"Sentry error: {str(e)}")
 
 
 def log_performance(func, name=None, logger=None, force=False):
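Net effect of the logging changes: the log level is read from the LOG_LEVEL environment variable on every call rather than cached, get_logger is now defined before init_sentry so Sentry failures can be logged and swallowed instead of raised, and telemetry can be disabled entirely with the VM_NO_TELEMETRY environment variable. A minimal usage sketch (the values below are illustrative; set the variables before initializing validmind so init_sentry sees them):

import os

# Read by the code in the hunks above: any non-empty value disables Sentry,
# and LOG_LEVEL must be one of DEBUG/INFO/WARNING/ERROR/CRITICAL.
os.environ["VM_NO_TELEMETRY"] = "1"
os.environ["LOG_LEVEL"] = "DEBUG"

from validmind.logging import get_logger

logger = get_logger(__name__)  # handler/formatter are attached only once per format
logger.debug("validmind logging configured from environment variables")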
validmind/models/foundation.py CHANGED
@@ -15,7 +15,7 @@ logger = get_logger(__name__)
 @dataclass
 class Prompt:
     template: str
-    variables: list
+    variables: list = None
 
 
 class FoundationModel(FunctionModel):
@@ -33,17 +33,21 @@ class FoundationModel(FunctionModel):
     """
 
     def __post_init__(self):
-        if not getattr(self, "predict_fn") or not callable(self.predict_fn):
-            raise ValueError("FoundationModel requires a callable predict_fn")
+        super().__post_init__()
 
-        self.name = self.name or self.predict_fn.__name__
+        if not hasattr(self, "prompt") or not isinstance(self.prompt, Prompt):
+            raise ValueError("FoundationModel requires a Prompt object")
 
     def _build_prompt(self, x: pd.DataFrame):
         """
         Builds the prompt for the model
         """
-        return self.prompt.template.format(
-            **{key: x[key] for key in self.prompt.variables}
+        return (
+            self.prompt.template.format(
+                **{key: x[key] for key in self.prompt.variables}
+            )
+            if self.prompt.variables
+            else self.prompt.template
         )
 
     def predict(self, X: pd.DataFrame):
validmind/models/function.py CHANGED
@@ -31,10 +31,12 @@ class FunctionModel(VMModel):
             input features and return a prediction.
         input_id (str, optional): The input ID for the model. Defaults to None.
         name (str, optional): The name of the model. Defaults to the name of the predict_fn.
+        prompt (Prompt, optional): If using a prompt, the prompt object that defines the template
+            and the variables (if any). Defaults to None.
     """
 
     def __post_init__(self):
-        if not getattr(self, "predict_fn") or not callable(self.predict_fn):
+        if not hasattr(self, "predict_fn") or not callable(self.predict_fn):
             raise ValueError("FunctionModel requires a callable predict_fn")
 
         self.name = self.name or self.predict_fn.__name__
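Together, the foundation.py and function.py hunks make prompts optional at the field level but strict in type: Prompt.variables now defaults to None (so a static template with no placeholders is valid and _build_prompt returns it unchanged), while FoundationModel.__post_init__ insists on receiving a Prompt instance. A rough sketch, assuming the classes can be imported from the module paths shown above and constructed directly with these dataclass fields (the predict function and template are made up; exact constructor arguments may differ):

from validmind.models.foundation import FoundationModel, Prompt

def ask_llm(prompt: str) -> str:
    # Hypothetical predict function; a real one would call an LLM client.
    return f"model output for: {prompt}"

# `variables` now defaults to None, so a template without placeholders is accepted.
static_prompt = Prompt(template="Classify the sentiment of this customer review.")

# FoundationModel now raises a ValueError if `prompt` is missing or not a Prompt.
model = FoundationModel(predict_fn=ask_llm, prompt=static_prompt)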
validmind/models/metadata.py CHANGED
@@ -24,7 +24,7 @@ class MetadataModel(VMModel):
     """
 
     def __post_init__(self):
-        if not getattr(self, "attributes"):
+        if not hasattr(self, "attributes"):
            raise ValueError("MetadataModel requires attributes")
 
        self.name = self.name or "Metadata Model"
validmind/test_suites/__init__.py CHANGED
@@ -25,12 +25,7 @@ from .embeddings import EmbeddingsFullSuite, EmbeddingsMetrics, EmbeddingsPerfor
 from .llm import LLMClassifierFullSuite, PromptValidation
 from .nlp import NLPClassifierFullSuite
 from .parameters_optimization import KmeansParametersOptimization
-from .regression import (
-    RegressionFullSuite,
-    RegressionMetrics,
-    RegressionModelsComparison,
-    RegressionPerformance,
-)
+from .regression import RegressionFullSuite, RegressionMetrics, RegressionPerformance
 from .statsmodels_timeseries import (
     RegressionModelDescription,
     RegressionModelsEvaluation,
@@ -72,7 +67,6 @@ core_test_suites = {
     RegressionMetrics.suite_id: RegressionMetrics,
     RegressionModelDescription.suite_id: RegressionModelDescription,
     RegressionModelsEvaluation.suite_id: RegressionModelsEvaluation,
-    RegressionModelsComparison.suite_id: RegressionModelsComparison,
     RegressionFullSuite.suite_id: RegressionFullSuite,
     RegressionPerformance.suite_id: RegressionPerformance,
     SummarizationMetrics.suite_id: SummarizationMetrics,
validmind/test_suites/regression.py CHANGED
@@ -32,17 +32,6 @@ class RegressionPerformance(TestSuite):
     ]
 
 
-class RegressionModelsComparison(TestSuite):
-    """
-    Test suite for regression models performance comparison
-    """
-
-    suite_id = "regression_models_comparison"
-    tests = [
-        "validmind.model_validation.sklearn.RegressionModelsPerformanceComparison",
-    ]
-
-
 class RegressionFullSuite(TestSuite):
     """
     Full test suite for regression models.
@@ -70,9 +59,4 @@ class RegressionFullSuite(TestSuite):
             "section_description": RegressionPerformance.__doc__,
             "section_tests": RegressionPerformance.tests,
         },
-        {
-            "section_id": RegressionModelsComparison.suite_id,
-            "section_description": RegressionModelsComparison.__doc__,
-            "section_tests": RegressionModelsComparison.tests,
-        },
     ]
validmind/test_suites/statsmodels_timeseries.py CHANGED
@@ -29,5 +29,5 @@ class RegressionModelsEvaluation(TestSuite):
     suite_id = "regression_models_evaluation"
     tests = [
         "validmind.model_validation.statsmodels.RegressionModelsCoeffs",
-        "validmind.model_validation.statsmodels.RegressionModelsPerformance",
+        "validmind.model_validation.sklearn.RegressionModelsPerformanceComparison",
     ]
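The three test-suite hunks above remove the standalone RegressionModelsComparison suite and point the regression_models_evaluation suite at the sklearn comparison test instead. The change is visible in the suite registry itself; a small check grounded in the hunks above:

from validmind.test_suites import core_test_suites

# The "regression_models_comparison" suite id no longer exists in 2.3.x; the
# comparison test now runs as part of "regression_models_evaluation".
assert "regression_models_comparison" not in core_test_suites

evaluation_suite = core_test_suites["regression_models_evaluation"]
print(evaluation_suite.tests)
# ['validmind.model_validation.statsmodels.RegressionModelsCoeffs',
#  'validmind.model_validation.sklearn.RegressionModelsPerformanceComparison']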
validmind/tests/data_validation/ACFandPACFPlot.py CHANGED
@@ -2,9 +2,9 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import matplotlib.pyplot as plt
 import pandas as pd
-from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
+import plotly.graph_objects as go
+from statsmodels.tsa.stattools import acf, pacf
 
 from validmind.vm_models import Figure, Metric
 
@@ -77,37 +77,46 @@ class ACFandPACFPlot(Metric):
         for col in df.columns:
             series = df[col]
 
-            # Create subplots
-            fig, (ax1, ax2) = plt.subplots(1, 2)
-            width, _ = fig.get_size_inches()
-            fig.set_size_inches(width, 5)
-
-            plot_acf(series, ax=ax1)
-            plot_pacf(series, ax=ax2)
-
-            # Get the current y-axis limits
-            ymin, ymax = ax1.get_ylim()
-            # Set new limits - adding a bit of space
-            ax1.set_ylim([ymin, ymax + 0.05 * (ymax - ymin)])
+            # Calculate the maximum number of lags based on the size of the dataset
+            max_lags = min(40, len(series) // 2 - 1)
+
+            # Calculate ACF and PACF values
+            acf_values = acf(series, nlags=max_lags)
+            pacf_values = pacf(series, nlags=max_lags)
+
+            # Create ACF plot using Plotly
+            acf_fig = go.Figure()
+            acf_fig.add_trace(go.Bar(x=list(range(len(acf_values))), y=acf_values))
+            acf_fig.update_layout(
+                title=f"ACF for {col}",
+                xaxis_title="Lag",
+                yaxis_title="ACF",
+                font=dict(size=18),
+            )
 
-            ymin, ymax = ax2.get_ylim()
-            ax2.set_ylim([ymin, ymax + 0.05 * (ymax - ymin)])
+            # Create PACF plot using Plotly
+            pacf_fig = go.Figure()
+            pacf_fig.add_trace(go.Bar(x=list(range(len(pacf_values))), y=pacf_values))
+            pacf_fig.update_layout(
+                title=f"PACF for {col}",
+                xaxis_title="Lag",
+                yaxis_title="PACF",
+                font=dict(size=18),
+            )
 
-            ax1.tick_params(axis="both", labelsize=18)
-            ax2.tick_params(axis="both", labelsize=18)
-            ax1.set_title(f"ACF for {col}", weight="bold", fontsize=20)
-            ax2.set_title(f"PACF for {col}", weight="bold", fontsize=20)
-            ax1.set_xlabel("Lag", fontsize=18)
-            ax2.set_xlabel("Lag", fontsize=18)
             figures.append(
                 Figure(
                     for_object=self,
-                    key=f"{self.key}:{col}",
-                    figure=fig,
+                    key=f"{self.key}:{col}_acf",
+                    figure=acf_fig,
+                )
+            )
+            figures.append(
+                Figure(
+                    for_object=self,
+                    key=f"{self.key}:{col}_pacf",
+                    figure=pacf_fig,
                 )
             )
-
-            # Do this if you want to prevent the figure from being displayed
-            plt.close("all")
 
         return self.cache_results(figures=figures)
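ACFandPACFPlot now computes ACF/PACF values directly with statsmodels and renders them as Plotly bar charts, so each column produces two interactive figures (keys suffixed _acf and _pacf) instead of one combined matplotlib figure, and the lag count adapts to short series. A hedged example of running the test on its own, assuming the standard run_test entry point and an already-initialized vm_dataset:

from validmind.tests import run_test

# vm_dataset is assumed to be a ValidMind dataset wrapping a time-series DataFrame.
result = run_test(
    "validmind.data_validation.ACFandPACFPlot",
    inputs={"dataset": vm_dataset},
)
result.show()  # one ACF and one PACF figure per column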
validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py RENAMED
@@ -2,12 +2,18 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from pandas import DataFrame
+from dataclasses import dataclass
+
+import pandas as pd
 from statsmodels.tsa.stattools import adfuller
 
+from validmind.logging import get_logger
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
+logger = get_logger(__name__)
+
 
+@dataclass
 class ADF(Metric):
     """
     Assesses the stationarity of a time series dataset using the Augmented Dickey-Fuller (ADF) test.
@@ -53,7 +59,7 @@ class ADF(Metric):
         }
 
     def summary(self, metric_value: dict):
-        table = DataFrame.from_dict(metric_value, orient="index")
+        table = pd.DataFrame.from_dict(metric_value, orient="index")
         table = table.reset_index()
         table.columns = [
             "Feature",
@@ -83,18 +89,41 @@ class ADF(Metric):
         """
         dataset = self.inputs.dataset.df
 
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
         adf_values = {}
         for col in dataset.columns:
-            adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(
-                dataset[col].values
-            )
-            adf_values[col] = {
-                "stat": adf,
-                "pvalue": pvalue,
-                "usedlag": usedlag,
-                "nobs": nobs,
-                "critical_values": critical_values,
-                "icbest": icbest,
-            }
+            try:
+                adf_result = adfuller(dataset[col].values)
+                adf_values[col] = {
+                    "ADF Statistic": adf_result[0],
+                    "P-Value": adf_result[1],
+                    "Used Lag": adf_result[2],
+                    "Number of Observations": adf_result[3],
+                    "Critical Values": adf_result[4],
+                    "IC Best": adf_result[5],
+                }
+            except Exception as e:
+                logger.error(f"Error processing column '{col}': {e}")
+                adf_values[col] = {
+                    "ADF Statistic": None,
+                    "P-Value": None,
+                    "Used Lag": None,
+                    "Number of Observations": None,
+                    "Critical Values": None,
+                    "IC Best": None,
+                    "Error": str(e),
+                }
 
         return self.cache_results(adf_values)
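The relocated ADF test now validates its input before running: the dataset index must be a DatetimeIndex or PeriodIndex, rows containing NaNs are dropped with a warning, and per-column failures are recorded in the results instead of aborting the run. A hedged sketch of preparing a compliant dataset (file, column, and input_id names are illustrative):

import pandas as pd
import validmind as vm
from validmind.tests import run_test

# Parse the date column and use it as the index; without a datetime-like index
# the test raises a ValueError (see the hunk above).
df = pd.read_csv("sales.csv", parse_dates=["date"]).set_index("date")

vm_dataset = vm.init_dataset(dataset=df, input_id="sales_ts")

# Note the new test id: ADF moved from model_validation.statsmodels to data_validation.
run_test("validmind.data_validation.ADF", inputs={"dataset": vm_dataset})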
validmind/tests/data_validation/BivariateScatterPlots.py CHANGED
@@ -2,10 +2,10 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+import itertools
 from dataclasses import dataclass
 
-import matplotlib.pyplot as plt
-import seaborn as sns
+import plotly.express as px
 
 from validmind.vm_models import Figure, Metric
 
@@ -23,7 +23,7 @@ class BivariateScatterPlots(Metric):
     biases and irregularities in the data.
 
     **Test Mechanism**: This metric operates by creating a scatter plot for each pair of the selected features in the
-    dataset. If the parameters "features_pairs" are not specified, an error will be thrown. The metric offers
+    dataset. If the parameters "selected_columns" are not specified, an error will be thrown. The metric offers
     flexibility by allowing the user to filter on a specific target class - specified by the "target_filter" parameter
     - for more granified insights. Each scatterplot is then color-coded based on the category of the target variable
     for better visual differentiation. The seaborn scatterplot library is used for generating the plots.
@@ -53,7 +53,7 @@ class BivariateScatterPlots(Metric):
 
     name = "bivariate_scatter_plots"
     required_inputs = ["dataset"]
-    default_params = {"features_pairs": None, "target_filter": None}
+    default_params = {"selected_columns": None}
     metadata = {
         "task_types": ["classification"],
         "tags": [
@@ -65,52 +65,49 @@ class BivariateScatterPlots(Metric):
         ],
     }
 
-    def plot_bivariate_scatter(self, features_pairs, target_filter):
-        status_var = self.inputs.dataset.target_column
+    def plot_bivariate_scatter(self, columns):
         figures = []
-        for x, y in features_pairs.items():
-            df = self.inputs.dataset.df
-            if target_filter is not None:
-                df = df[df[status_var] == target_filter]
-
-            plt.figure()
-
-            # Scatterplot using seaborn, with color variation based on 'status_var'
-            # Create color mapping with rgba values, last value is alpha (transparency)
-            palette = {0: (0.8, 0.8, 0.8, 0.8), 1: "tab:red"}
-            plot = sns.scatterplot(
-                data=df, x=x, y=y, hue=status_var, palette=palette, alpha=1
+        df = self.inputs.dataset.df
+
+        # Generate all pairs of columns
+        features_pairs = list(itertools.combinations(columns, 2))
+
+        for x, y in features_pairs:
+            fig = px.scatter(
+                df,
+                x=x,
+                y=y,
+                title=f"{x} and {y}",
+                labels={x: x, y: y},
+                opacity=0.7,
+                color_discrete_sequence=["blue"],  # Use the same color for all points
             )
-
-            # Change legend labels
-            legend_labels = [
-                "Category 1" if t.get_text() == "1" else "Category 2"
-                for t in plot.legend_.texts[1:]
-            ]
-            plot.legend_.texts[1:] = legend_labels
-
-            plt.title(x + " and " + y)
-            plt.xlabel(x)
-            plt.ylabel(y)
-            plt.show()
+            fig.update_traces(marker=dict(color="blue"))
 
             figures.append(
-                Figure(for_object=self, key=f"{self.key}:{x}_{y}", figure=plt.figure())
+                Figure(for_object=self, key=f"{self.key}:{x}_{y}", figure=fig)
            )
 
-        plt.close("all")
-
         return figures
 
     def run(self):
-        features_pairs = self.params["features_pairs"]
-        target_filter = self.params["target_filter"]
-
-        if features_pairs is None:
-            raise ValueError(
-                "The features_pairs parameter is required for this metric."
-            )
+        selected_columns = self.params["selected_columns"]
+
+        if selected_columns is None:
+            # Use all columns if selected_columns is not provided
+            selected_columns = self.inputs.dataset.df.columns.tolist()
+        else:
+            # Check if all selected columns exist in the dataframe
+            missing_columns = [
+                col
+                for col in selected_columns
+                if col not in self.inputs.dataset.df.columns
+            ]
+            if missing_columns:
+                raise ValueError(
+                    f"The following selected columns are not in the dataframe: {missing_columns}"
                )
 
-        figures = self.plot_bivariate_scatter(features_pairs, target_filter)
+        figures = self.plot_bivariate_scatter(selected_columns)
 
         return self.cache_results(figures=figures)
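BivariateScatterPlots drops the features_pairs and target_filter parameters: you now pass selected_columns (or nothing, to plot every column) and the test builds Plotly scatter plots for all pairwise combinations. A hedged example, assuming the same run_test entry point as above (column names are illustrative):

from validmind.tests import run_test

run_test(
    "validmind.data_validation.BivariateScatterPlots",
    inputs={"dataset": vm_dataset},
    params={"selected_columns": ["Balance", "NumOfProducts", "Age"]},
)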
validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py RENAMED
@@ -4,9 +4,14 @@
 
 from dataclasses import dataclass
 
+import pandas as pd
 from arch.unitroot import DFGLS
+from numpy.linalg import LinAlgError
 
-from validmind.vm_models import Metric
+from validmind.logging import get_logger
+from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+
+logger = get_logger(__name__)
 
 
 @dataclass
@@ -59,14 +64,65 @@ class DFGLSArch(Metric):
         """
         dataset = self.inputs.dataset.df
 
-        dfgls_values = {}
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
+        # Convert to numeric and handle non-numeric data
+        dataset = dataset.apply(pd.to_numeric, errors="coerce")
+
+        # Initialize a list to store DFGLS results
+        dfgls_values = []
+
         for col in dataset.columns:
-            dfgls_out = DFGLS(dataset[col].values)
-            dfgls_values[col] = {
-                "stat": dfgls_out.stat,
-                "pvalue": dfgls_out.pvalue,
-                "usedlag": dfgls_out.lags,
-                "nobs": dfgls_out.nobs,
-            }
-
-        return self.cache_results(dfgls_values)
+            try:
+                dfgls_out = DFGLS(dataset[col].values)
+                dfgls_values.append(
+                    {
+                        "Variable": col,
+                        "stat": dfgls_out.stat,
+                        "pvalue": dfgls_out.pvalue,
+                        "usedlag": dfgls_out.lags,
+                        "nobs": dfgls_out.nobs,
+                    }
+                )
+            except LinAlgError as e:
+                logger.error(
+                    f"SVD did not converge while processing column '{col}'. This could be due to numerical instability or multicollinearity. Error details: {e}"
+                )
+                dfgls_values.append(
+                    {
+                        "Variable": col,
+                        "stat": None,
+                        "pvalue": None,
+                        "usedlag": None,
+                        "nobs": None,
+                        "error": str(e),
+                    }
+                )
+
+        return self.cache_results({"dfgls_results": dfgls_values})
+
+    def summary(self, metric_value):
+        """
+        Build a table for summarizing the DFGLS results
+        """
+        dfgls_results = metric_value["dfgls_results"]
+
+        return ResultSummary(
+            results=[
+                ResultTable(
+                    data=dfgls_results,
+                    metadata=ResultTableMetadata(title="DFGLS Test Results"),
+                )
+            ]
+        )
validmind/tests/data_validation/HeatmapFeatureCorrelations.py CHANGED
@@ -62,7 +62,7 @@ class HeatmapFeatureCorrelations(Metric):
     }
 
     def run(self):
-        features = self.params["features"]
+        features = self.params.get("features")
         declutter = self.params.get("declutter", False)
         fontsize = self.params.get("fontsize", 13)
 
validmind/tests/data_validation/HighPearsonCorrelation.py CHANGED
@@ -65,9 +65,18 @@ class HighPearsonCorrelation(ThresholdTest):
     }
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The high pearson correlation test returns results like these:
-        [{"values": {"correlations": [{"column": "NumOfProducts", "correlation": -0.3044645622389459}]}, "column": "Balance", "passed": false}]
+        """The high pearson correlation test returns results like these:
+        [
+            {
+                "values": {
+                    "correlations": [
+                        {"column": "NumOfProducts", "correlation": -0.3044645622389459}
+                    ]
+                },
+                "column": "Balance",
+                "passed": false,
+            }
+        ]
         """
         results_table = [
             {
validmind/tests/data_validation/IsolationForestOutliers.py CHANGED
@@ -64,7 +64,7 @@ class IsolationForestOutliers(Metric):
 
     def run(self):
         if self.params["features_columns"] is None:
-            features_list = self.inputs.dataset.feature_columns
+            features_list = self.inputs.dataset.feature_columns_numeric
         else:
            features_list = self.params["features_columns"]
 
@@ -78,7 +78,7 @@ class IsolationForestOutliers(Metric):
                 + "training dataset feature columns"
             )
 
-        dataset = self.inputs.dataset.df
+        dataset = self.inputs.dataset.df[features_list]
 
         # Training with isolation forest algorithm
         clf = IsolationForest(