validmind-2.2.6-py3-none-any.whl → validmind-2.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. validmind/__init__.py +2 -1
  2. validmind/__version__.py +1 -1
  3. validmind/{ai.py → ai/test_descriptions.py} +74 -82
  4. validmind/ai/utils.py +104 -0
  5. validmind/api_client.py +58 -19
  6. validmind/client.py +5 -5
  7. validmind/models/foundation.py +10 -6
  8. validmind/models/function.py +3 -1
  9. validmind/models/metadata.py +1 -1
  10. validmind/test_suites/__init__.py +1 -9
  11. validmind/test_suites/regression.py +0 -16
  12. validmind/test_suites/statsmodels_timeseries.py +1 -1
  13. validmind/tests/__init__.py +7 -7
  14. validmind/tests/__types__.py +170 -0
  15. validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
  16. validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
  17. validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
  18. validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
  19. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
  20. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
  21. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  22. validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
  23. validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
  24. validmind/tests/data_validation/ScatterPlot.py +1 -1
  25. validmind/tests/data_validation/SeasonalDecompose.py +12 -7
  26. validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
  27. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  28. validmind/tests/data_validation/WOEBinTable.py +1 -1
  29. validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
  30. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  31. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  32. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
  34. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  35. validmind/tests/data_validation/nlp/Sentiment.py +1 -1
  36. validmind/tests/data_validation/nlp/TextDescription.py +5 -1
  37. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  38. validmind/tests/decorator.py +13 -1
  39. validmind/tests/model_validation/FeaturesAUC.py +5 -3
  40. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
  41. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
  42. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
  43. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
  44. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
  45. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
  46. validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
  47. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  48. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  49. validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
  50. validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
  51. validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
  52. validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
  53. validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
  54. validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
  55. validmind/tests/model_validation/ragas/utils.py +35 -9
  56. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  57. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
  58. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
  59. validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
  60. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
  61. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  62. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
  63. validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
  64. validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
  65. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
  66. validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
  67. validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
  68. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
  69. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
  70. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
  71. validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
  72. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
  73. validmind/tests/prompt_validation/Bias.py +14 -11
  74. validmind/tests/prompt_validation/Clarity.py +14 -11
  75. validmind/tests/prompt_validation/Conciseness.py +14 -11
  76. validmind/tests/prompt_validation/Delimitation.py +14 -11
  77. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  78. validmind/tests/prompt_validation/Robustness.py +11 -11
  79. validmind/tests/prompt_validation/Specificity.py +14 -11
  80. validmind/tests/prompt_validation/ai_powered_test.py +53 -75
  81. validmind/unit_metrics/composite.py +2 -1
  82. validmind/utils.py +34 -59
  83. validmind/vm_models/dataset/dataset.py +17 -3
  84. validmind/vm_models/dataset/utils.py +2 -2
  85. validmind/vm_models/model.py +1 -1
  86. validmind/vm_models/test/metric.py +1 -8
  87. validmind/vm_models/test/result_wrapper.py +2 -2
  88. validmind/vm_models/test/test.py +3 -0
  89. validmind/vm_models/test/threshold_test.py +1 -1
  90. validmind/vm_models/test_suite/runner.py +7 -4
  91. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/METADATA +1 -1
  92. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/RECORD +95 -103
  93. validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
  94. validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
  95. validmind/tests/data_validation/PiTPDHistogram.py +0 -152
  96. validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
  97. validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
  98. validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
  99. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
  100. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
  101. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
  102. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
  103. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/LICENSE +0 -0
  104. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/WHEEL +0 -0
  105. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/entry_points.txt +0 -0
--- a/validmind/tests/__init__.py
+++ b/validmind/tests/__init__.py
@@ -30,7 +30,9 @@ from ..utils import (
     test_id_to_name,
 )
 from ..vm_models import TestContext, TestInput
-from .decorator import metric, tags, tasks
+from .__types__ import TestID
+from .decorator import tags, tasks
+from .decorator import test as test_decorator
 from .test_providers import LocalTestProvider, TestProvider
 
 logger = get_logger(__name__)
@@ -84,7 +86,6 @@ def _pretty_list_tests(tests, truncate=True):
        {
            "ID": test_id,
            "Name": test_id_to_name(test_id),
-           "Test Type": __test_classes[test_id].test_type,
            "Description": _test_description(__test_classes[test_id], truncate),
            "Required Inputs": __test_classes[test_id].required_inputs,
            "Params": __test_classes[test_id].default_params or {},
@@ -340,7 +341,7 @@ def load_test(test_id: str, reload=False):
    # if its a function, we decorate it and then load the class
    # TODO: simplify this as we move towards all functional metrics
    # "_" is used here so it doesn't conflict with other test ids
-   metric("_")(test)
+   test_decorator("_")(test)
    test = __custom_tests["_"]
 
    test.test_id = f"{test_id}:{result_id}" if result_id else test_id
@@ -348,7 +349,7 @@ def load_test(test_id: str, reload=False):
    return test
 
 
-def describe_test(test_id: str = None, raw: bool = False, show: bool = True):
+def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
    """Get or show details about the test
 
    This function can be used to see test details including the test name, description,
@@ -365,7 +366,6 @@ def describe_test(test_id: str = None, raw: bool = False, show: bool = True):
    details = {
        "ID": test_id,
        "Name": test_id_to_name(test_id),
-       "Test Type": test.test_type,
        "Required Inputs": test.required_inputs,
        "Params": test.default_params or {},
        "Description": inspect.getdoc(test).strip() or "",
@@ -407,7 +407,7 @@ def describe_test(test_id: str = None, raw: bool = False, show: bool = True):
 
 
 def run_test(
-    test_id: str = None,
+    test_id: TestID = None,
     name: str = None,
     unit_metrics: list = None,
     params: dict = None,
@@ -451,7 +451,7 @@ def run_test(
 
     if unit_metrics:
         metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())
-        test_id = f"validmind.composite_metric.{metric_id_name}"
+        test_id = f"validmind.composite_test.{metric_id_name}"
 
         error, TestClass = load_composite_metric(
             unit_metrics=unit_metrics, metric_name=metric_id_name
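
For context, a minimal sketch of how the new composite test ID is derived from the `name` argument passed to `run_test` (the example name is illustrative):

    # Sketch: deriving the composite test ID from `name` (illustrative values)
    name = "Model Error Metrics"

    # Camel-case each word and join them together: "ModelErrorMetrics"
    metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())

    # As of 2.3.3 the ID prefix is "composite_test" instead of "composite_metric"
    test_id = f"validmind.composite_test.{metric_id_name}"
    print(test_id)  # validmind.composite_test.ModelErrorMetrics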
--- /dev/null
+++ b/validmind/tests/__types__.py
@@ -0,0 +1,170 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""Literal types for test IDs.
+
+This module is auto-generated by running `make generate-test-id-types`.
+Should not be modified manually.
+"""
+
+from typing import Literal
+
+TestID = Literal[
+    "validmind.prompt_validation.Bias",
+    "validmind.prompt_validation.Clarity",
+    "validmind.prompt_validation.Specificity",
+    "validmind.prompt_validation.Robustness",
+    "validmind.prompt_validation.NegativeInstruction",
+    "validmind.prompt_validation.Conciseness",
+    "validmind.prompt_validation.Delimitation",
+    "validmind.model_validation.BertScore",
+    "validmind.model_validation.RegardScore",
+    "validmind.model_validation.BleuScore",
+    "validmind.model_validation.RegressionResidualsPlot",
+    "validmind.model_validation.FeaturesAUC",
+    "validmind.model_validation.ContextualRecall",
+    "validmind.model_validation.MeteorScore",
+    "validmind.model_validation.RougeScore",
+    "validmind.model_validation.ModelMetadata",
+    "validmind.model_validation.ClusterSizeDistribution",
+    "validmind.model_validation.TokenDisparity",
+    "validmind.model_validation.ToxicityScore",
+    "validmind.model_validation.embeddings.CosineSimilarityComparison",
+    "validmind.model_validation.embeddings.EmbeddingsVisualization2D",
+    "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise",
+    "validmind.model_validation.embeddings.TSNEComponentsPairwisePlots",
+    "validmind.model_validation.embeddings.CosineSimilarityDistribution",
+    "validmind.model_validation.embeddings.PCAComponentsPairwisePlots",
+    "validmind.model_validation.embeddings.CosineSimilarityHeatmap",
+    "validmind.model_validation.embeddings.StabilityAnalysisTranslation",
+    "validmind.model_validation.embeddings.EuclideanDistanceComparison",
+    "validmind.model_validation.embeddings.ClusterDistribution",
+    "validmind.model_validation.embeddings.EuclideanDistanceHeatmap",
+    "validmind.model_validation.embeddings.StabilityAnalysis",
+    "validmind.model_validation.embeddings.StabilityAnalysisKeyword",
+    "validmind.model_validation.embeddings.StabilityAnalysisSynonyms",
+    "validmind.model_validation.embeddings.DescriptiveAnalytics",
+    "validmind.model_validation.ragas.ContextEntityRecall",
+    "validmind.model_validation.ragas.Faithfulness",
+    "validmind.model_validation.ragas.AspectCritique",
+    "validmind.model_validation.ragas.AnswerSimilarity",
+    "validmind.model_validation.ragas.AnswerCorrectness",
+    "validmind.model_validation.ragas.ContextRecall",
+    "validmind.model_validation.ragas.ContextRelevancy",
+    "validmind.model_validation.ragas.ContextPrecision",
+    "validmind.model_validation.ragas.AnswerRelevance",
+    "validmind.model_validation.sklearn.RegressionModelsPerformanceComparison",
+    "validmind.model_validation.sklearn.AdjustedMutualInformation",
+    "validmind.model_validation.sklearn.SilhouettePlot",
+    "validmind.model_validation.sklearn.RobustnessDiagnosis",
+    "validmind.model_validation.sklearn.AdjustedRandIndex",
+    "validmind.model_validation.sklearn.SHAPGlobalImportance",
+    "validmind.model_validation.sklearn.ConfusionMatrix",
+    "validmind.model_validation.sklearn.HomogeneityScore",
+    "validmind.model_validation.sklearn.CompletenessScore",
+    "validmind.model_validation.sklearn.OverfitDiagnosis",
+    "validmind.model_validation.sklearn.ClusterPerformanceMetrics",
+    "validmind.model_validation.sklearn.PermutationFeatureImportance",
+    "validmind.model_validation.sklearn.FowlkesMallowsScore",
+    "validmind.model_validation.sklearn.MinimumROCAUCScore",
+    "validmind.model_validation.sklearn.ClusterCosineSimilarity",
+    "validmind.model_validation.sklearn.PrecisionRecallCurve",
+    "validmind.model_validation.sklearn.ClassifierPerformance",
+    "validmind.model_validation.sklearn.VMeasure",
+    "validmind.model_validation.sklearn.MinimumF1Score",
+    "validmind.model_validation.sklearn.ROCCurve",
+    "validmind.model_validation.sklearn.RegressionR2Square",
+    "validmind.model_validation.sklearn.RegressionErrors",
+    "validmind.model_validation.sklearn.ClusterPerformance",
+    "validmind.model_validation.sklearn.TrainingTestDegradation",
+    "validmind.model_validation.sklearn.HyperParametersTuning",
+    "validmind.model_validation.sklearn.KMeansClustersOptimization",
+    "validmind.model_validation.sklearn.ModelsPerformanceComparison",
+    "validmind.model_validation.sklearn.WeakspotsDiagnosis",
+    "validmind.model_validation.sklearn.PopulationStabilityIndex",
+    "validmind.model_validation.sklearn.MinimumAccuracy",
+    "validmind.model_validation.statsmodels.RegressionModelsCoeffs",
+    "validmind.model_validation.statsmodels.BoxPierce",
+    "validmind.model_validation.statsmodels.RegressionCoeffsPlot",
+    "validmind.model_validation.statsmodels.RegressionModelSensitivityPlot",
+    "validmind.model_validation.statsmodels.RegressionModelForecastPlotLevels",
+    "validmind.model_validation.statsmodels.ScorecardHistogram",
+    "validmind.model_validation.statsmodels.LJungBox",
+    "validmind.model_validation.statsmodels.JarqueBera",
+    "validmind.model_validation.statsmodels.KolmogorovSmirnov",
+    "validmind.model_validation.statsmodels.ShapiroWilk",
+    "validmind.model_validation.statsmodels.CumulativePredictionProbabilities",
+    "validmind.model_validation.statsmodels.RegressionFeatureSignificance",
+    "validmind.model_validation.statsmodels.RegressionModelSummary",
+    "validmind.model_validation.statsmodels.Lilliefors",
+    "validmind.model_validation.statsmodels.RunsTest",
+    "validmind.model_validation.statsmodels.RegressionPermutationFeatureImportance",
+    "validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram",
+    "validmind.model_validation.statsmodels.AutoARIMA",
+    "validmind.model_validation.statsmodels.GINITable",
+    "validmind.model_validation.statsmodels.RegressionModelForecastPlot",
+    "validmind.model_validation.statsmodels.DurbinWatsonTest",
+    "validmind.data_validation.MissingValuesRisk",
+    "validmind.data_validation.IQROutliersTable",
+    "validmind.data_validation.BivariateFeaturesBarPlots",
+    "validmind.data_validation.Skewness",
+    "validmind.data_validation.Duplicates",
+    "validmind.data_validation.MissingValuesBarPlot",
+    "validmind.data_validation.DatasetDescription",
+    "validmind.data_validation.ZivotAndrewsArch",
+    "validmind.data_validation.ScatterPlot",
+    "validmind.data_validation.TimeSeriesOutliers",
+    "validmind.data_validation.TabularCategoricalBarPlots",
+    "validmind.data_validation.AutoStationarity",
+    "validmind.data_validation.DescriptiveStatistics",
+    "validmind.data_validation.ANOVAOneWayTable",
+    "validmind.data_validation.TargetRateBarPlots",
+    "validmind.data_validation.PearsonCorrelationMatrix",
+    "validmind.data_validation.FeatureTargetCorrelationPlot",
+    "validmind.data_validation.TabularNumericalHistograms",
+    "validmind.data_validation.IsolationForestOutliers",
+    "validmind.data_validation.ChiSquaredFeaturesTable",
+    "validmind.data_validation.HighCardinality",
+    "validmind.data_validation.MissingValues",
+    "validmind.data_validation.PhillipsPerronArch",
+    "validmind.data_validation.RollingStatsPlot",
+    "validmind.data_validation.TabularDescriptionTables",
+    "validmind.data_validation.AutoMA",
+    "validmind.data_validation.UniqueRows",
+    "validmind.data_validation.TooManyZeroValues",
+    "validmind.data_validation.HighPearsonCorrelation",
+    "validmind.data_validation.ACFandPACFPlot",
+    "validmind.data_validation.BivariateHistograms",
+    "validmind.data_validation.WOEBinTable",
+    "validmind.data_validation.HeatmapFeatureCorrelations",
+    "validmind.data_validation.TimeSeriesFrequency",
+    "validmind.data_validation.DatasetSplit",
+    "validmind.data_validation.SpreadPlot",
+    "validmind.data_validation.TimeSeriesLinePlot",
+    "validmind.data_validation.KPSS",
+    "validmind.data_validation.AutoSeasonality",
+    "validmind.data_validation.BivariateScatterPlots",
+    "validmind.data_validation.EngleGrangerCoint",
+    "validmind.data_validation.TimeSeriesMissingValues",
+    "validmind.data_validation.TimeSeriesHistogram",
+    "validmind.data_validation.LaggedCorrelationHeatmap",
+    "validmind.data_validation.SeasonalDecompose",
+    "validmind.data_validation.WOEBinPlots",
+    "validmind.data_validation.ClassImbalance",
+    "validmind.data_validation.IQROutliersBarPlot",
+    "validmind.data_validation.DFGLSArch",
+    "validmind.data_validation.AutoAR",
+    "validmind.data_validation.TabularDateTimeHistograms",
+    "validmind.data_validation.ADF",
+    "validmind.data_validation.nlp.Toxicity",
+    "validmind.data_validation.nlp.PolarityAndSubjectivity",
+    "validmind.data_validation.nlp.Punctuations",
+    "validmind.data_validation.nlp.Sentiment",
+    "validmind.data_validation.nlp.CommonWords",
+    "validmind.data_validation.nlp.Hashtags",
+    "validmind.data_validation.nlp.LanguageDetection",
+    "validmind.data_validation.nlp.Mentions",
+    "validmind.data_validation.nlp.TextDescription",
+    "validmind.data_validation.nlp.StopWords",
+]
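
Because `TestID` is a `typing.Literal` union of every built-in test ID, static type checkers and IDEs can now autocomplete and validate the `test_id` argument of `describe_test` and `run_test`. A minimal usage sketch (assuming the package is installed):

    from validmind.tests import describe_test

    # A valid ID type-checks cleanly; a typo such as
    # "validmind.data_validation.ADFF" would be flagged by mypy/pyright
    # before the code ever runs
    describe_test("validmind.data_validation.ADF")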
--- a/validmind/tests/data_validation/ACFandPACFPlot.py
+++ b/validmind/tests/data_validation/ACFandPACFPlot.py
@@ -2,9 +2,9 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import matplotlib.pyplot as plt
 import pandas as pd
-from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
+import plotly.graph_objects as go
+from statsmodels.tsa.stattools import acf, pacf
 
 from validmind.vm_models import Figure, Metric
 
@@ -77,37 +77,46 @@ class ACFandPACFPlot(Metric):
         for col in df.columns:
             series = df[col]
 
-            # Create subplots
-            fig, (ax1, ax2) = plt.subplots(1, 2)
-            width, _ = fig.get_size_inches()
-            fig.set_size_inches(width, 5)
-
-            plot_acf(series, ax=ax1)
-            plot_pacf(series, ax=ax2)
-
-            # Get the current y-axis limits
-            ymin, ymax = ax1.get_ylim()
-            # Set new limits - adding a bit of space
-            ax1.set_ylim([ymin, ymax + 0.05 * (ymax - ymin)])
+            # Calculate the maximum number of lags based on the size of the dataset
+            max_lags = min(40, len(series) // 2 - 1)
+
+            # Calculate ACF and PACF values
+            acf_values = acf(series, nlags=max_lags)
+            pacf_values = pacf(series, nlags=max_lags)
+
+            # Create ACF plot using Plotly
+            acf_fig = go.Figure()
+            acf_fig.add_trace(go.Bar(x=list(range(len(acf_values))), y=acf_values))
+            acf_fig.update_layout(
+                title=f"ACF for {col}",
+                xaxis_title="Lag",
+                yaxis_title="ACF",
+                font=dict(size=18),
+            )
 
-            ymin, ymax = ax2.get_ylim()
-            ax2.set_ylim([ymin, ymax + 0.05 * (ymax - ymin)])
+            # Create PACF plot using Plotly
+            pacf_fig = go.Figure()
+            pacf_fig.add_trace(go.Bar(x=list(range(len(pacf_values))), y=pacf_values))
+            pacf_fig.update_layout(
+                title=f"PACF for {col}",
+                xaxis_title="Lag",
+                yaxis_title="PACF",
+                font=dict(size=18),
+            )
 
-            ax1.tick_params(axis="both", labelsize=18)
-            ax2.tick_params(axis="both", labelsize=18)
-            ax1.set_title(f"ACF for {col}", weight="bold", fontsize=20)
-            ax2.set_title(f"PACF for {col}", weight="bold", fontsize=20)
-            ax1.set_xlabel("Lag", fontsize=18)
-            ax2.set_xlabel("Lag", fontsize=18)
             figures.append(
                 Figure(
                     for_object=self,
-                    key=f"{self.key}:{col}",
-                    figure=fig,
+                    key=f"{self.key}:{col}_acf",
+                    figure=acf_fig,
+                )
+            )
+            figures.append(
+                Figure(
+                    for_object=self,
+                    key=f"{self.key}:{col}_pacf",
+                    figure=pacf_fig,
                 )
             )
-
-        # Do this if you want to prevent the figure from being displayed
-        plt.close("all")
 
         return self.cache_results(figures=figures)
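
The rewritten test computes ACF/PACF values directly and renders them as Plotly bar charts. A standalone sketch of the same approach, using toy data rather than a ValidMind dataset:

    import numpy as np
    import plotly.graph_objects as go
    from statsmodels.tsa.stattools import acf, pacf

    series = np.random.default_rng(42).normal(size=200).cumsum()  # toy random walk

    # Cap the lags as the updated test does: at most 40, and fewer than
    # half the series length (a statsmodels requirement for pacf)
    max_lags = min(40, len(series) // 2 - 1)

    acf_values = acf(series, nlags=max_lags)
    pacf_values = pacf(series, nlags=max_lags)

    fig = go.Figure(go.Bar(x=list(range(len(acf_values))), y=acf_values))
    fig.update_layout(title="ACF", xaxis_title="Lag", yaxis_title="ACF")
    # fig.show()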
--- a/validmind/tests/model_validation/statsmodels/ADF.py
+++ b/validmind/tests/data_validation/ADF.py
@@ -2,12 +2,18 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from pandas import DataFrame
+from dataclasses import dataclass
+
+import pandas as pd
 from statsmodels.tsa.stattools import adfuller
 
+from validmind.logging import get_logger
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
+logger = get_logger(__name__)
+
 
+@dataclass
 class ADF(Metric):
     """
     Assesses the stationarity of a time series dataset using the Augmented Dickey-Fuller (ADF) test.
@@ -53,7 +59,7 @@ class ADF(Metric):
     }
 
     def summary(self, metric_value: dict):
-        table = DataFrame.from_dict(metric_value, orient="index")
+        table = pd.DataFrame.from_dict(metric_value, orient="index")
         table = table.reset_index()
         table.columns = [
             "Feature",
@@ -83,18 +89,41 @@ class ADF(Metric):
         """
         dataset = self.inputs.dataset.df
 
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
         adf_values = {}
         for col in dataset.columns:
-            adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(
-                dataset[col].values
-            )
-            adf_values[col] = {
-                "stat": adf,
-                "pvalue": pvalue,
-                "usedlag": usedlag,
-                "nobs": nobs,
-                "critical_values": critical_values,
-                "icbest": icbest,
-            }
+            try:
+                adf_result = adfuller(dataset[col].values)
+                adf_values[col] = {
+                    "ADF Statistic": adf_result[0],
+                    "P-Value": adf_result[1],
+                    "Used Lag": adf_result[2],
+                    "Number of Observations": adf_result[3],
+                    "Critical Values": adf_result[4],
+                    "IC Best": adf_result[5],
+                }
+            except Exception as e:
+                logger.error(f"Error processing column '{col}': {e}")
+                adf_values[col] = {
+                    "ADF Statistic": None,
+                    "P-Value": None,
+                    "Used Lag": None,
+                    "Number of Observations": None,
+                    "Critical Values": None,
+                    "IC Best": None,
+                    "Error": str(e),
+                }
 
         return self.cache_results(adf_values)
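
For reference, `adfuller` returns a 6-tuple, which the updated test now indexes into and relabels. A toy sketch:

    import numpy as np
    from statsmodels.tsa.stattools import adfuller

    series = np.random.default_rng(0).normal(size=100)  # toy stationary series

    # adfuller returns (statistic, p-value, used lag, n observations,
    # critical values dict, best information criterion)
    stat, pvalue, usedlag, nobs, critical_values, icbest = adfuller(series)

    # A small p-value (e.g. < 0.05) rejects the unit-root null hypothesis,
    # suggesting the series is stationary
    print(f"ADF statistic={stat:.3f}, p-value={pvalue:.3f}")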
--- a/validmind/tests/data_validation/BivariateScatterPlots.py
+++ b/validmind/tests/data_validation/BivariateScatterPlots.py
@@ -2,10 +2,10 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
+import itertools
 from dataclasses import dataclass
 
-import matplotlib.pyplot as plt
-import seaborn as sns
+import plotly.express as px
 
 from validmind.vm_models import Figure, Metric
 
@@ -23,7 +23,7 @@ class BivariateScatterPlots(Metric):
     biases and irregularities in the data.
 
     **Test Mechanism**: This metric operates by creating a scatter plot for each pair of the selected features in the
-    dataset. If the parameters "features_pairs" are not specified, an error will be thrown. The metric offers
+    dataset. If the parameters "selected_columns" are not specified, an error will be thrown. The metric offers
     flexibility by allowing the user to filter on a specific target class - specified by the "target_filter" parameter
     - for more granified insights. Each scatterplot is then color-coded based on the category of the target variable
     for better visual differentiation. The seaborn scatterplot library is used for generating the plots.
@@ -53,7 +53,7 @@ class BivariateScatterPlots(Metric):
 
     name = "bivariate_scatter_plots"
     required_inputs = ["dataset"]
-    default_params = {"features_pairs": None, "target_filter": None}
+    default_params = {"selected_columns": None}
     metadata = {
         "task_types": ["classification"],
         "tags": [
@@ -65,52 +65,49 @@ class BivariateScatterPlots(Metric):
         ],
     }
 
-    def plot_bivariate_scatter(self, features_pairs, target_filter):
-        status_var = self.inputs.dataset.target_column
+    def plot_bivariate_scatter(self, columns):
         figures = []
-        for x, y in features_pairs.items():
-            df = self.inputs.dataset.df
-            if target_filter is not None:
-                df = df[df[status_var] == target_filter]
-
-            plt.figure()
-
-            # Scatterplot using seaborn, with color variation based on 'status_var'
-            # Create color mapping with rgba values, last value is alpha (transparency)
-            palette = {0: (0.8, 0.8, 0.8, 0.8), 1: "tab:red"}
-            plot = sns.scatterplot(
-                data=df, x=x, y=y, hue=status_var, palette=palette, alpha=1
+        df = self.inputs.dataset.df
+
+        # Generate all pairs of columns
+        features_pairs = list(itertools.combinations(columns, 2))
+
+        for x, y in features_pairs:
+            fig = px.scatter(
+                df,
+                x=x,
+                y=y,
+                title=f"{x} and {y}",
+                labels={x: x, y: y},
+                opacity=0.7,
+                color_discrete_sequence=["blue"],  # Use the same color for all points
             )
-
-            # Change legend labels
-            legend_labels = [
-                "Category 1" if t.get_text() == "1" else "Category 2"
-                for t in plot.legend_.texts[1:]
-            ]
-            plot.legend_.texts[1:] = legend_labels
-
-            plt.title(x + " and " + y)
-            plt.xlabel(x)
-            plt.ylabel(y)
-            plt.show()
+            fig.update_traces(marker=dict(color="blue"))
 
             figures.append(
-                Figure(for_object=self, key=f"{self.key}:{x}_{y}", figure=plt.figure())
+                Figure(for_object=self, key=f"{self.key}:{x}_{y}", figure=fig)
             )
 
-            plt.close("all")
-
         return figures
 
     def run(self):
-        features_pairs = self.params["features_pairs"]
-        target_filter = self.params["target_filter"]
-
-        if features_pairs is None:
-            raise ValueError(
-                "The features_pairs parameter is required for this metric."
-            )
+        selected_columns = self.params["selected_columns"]
+
+        if selected_columns is None:
+            # Use all columns if selected_columns is not provided
+            selected_columns = self.inputs.dataset.df.columns.tolist()
+        else:
+            # Check if all selected columns exist in the dataframe
+            missing_columns = [
+                col
+                for col in selected_columns
+                if col not in self.inputs.dataset.df.columns
+            ]
+            if missing_columns:
+                raise ValueError(
+                    f"The following selected columns are not in the dataframe: {missing_columns}"
+                )
 
-        figures = self.plot_bivariate_scatter(features_pairs, target_filter)
+        figures = self.plot_bivariate_scatter(selected_columns)
 
         return self.cache_results(figures=figures)
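
The key behavioral change: instead of requiring an explicit `features_pairs` dict, the test now builds every unordered pair of the selected columns itself. A toy sketch of that pairing logic:

    import itertools

    import pandas as pd
    import plotly.express as px

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

    # combinations() yields each unordered pair exactly once:
    # ("a", "b"), ("a", "c"), ("b", "c")
    for x, y in itertools.combinations(df.columns.tolist(), 2):
        fig = px.scatter(df, x=x, y=y, title=f"{x} and {y}", opacity=0.7)
        # fig.show()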
--- a/validmind/tests/model_validation/statsmodels/DFGLSArch.py
+++ b/validmind/tests/data_validation/DFGLSArch.py
@@ -4,9 +4,14 @@
 
 from dataclasses import dataclass
 
+import pandas as pd
 from arch.unitroot import DFGLS
+from numpy.linalg import LinAlgError
 
-from validmind.vm_models import Metric
+from validmind.logging import get_logger
+from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+
+logger = get_logger(__name__)
 
 
 @dataclass
@@ -59,14 +64,65 @@ class DFGLSArch(Metric):
         """
         dataset = self.inputs.dataset.df
 
-        dfgls_values = {}
+        # Check if the dataset is a time series
+        if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
+            raise ValueError(
+                "Dataset index must be a datetime or period index for time series analysis."
+            )
+
+        # Preprocessing: Drop rows with any NaN values
+        if dataset.isnull().values.any():
+            logger.warning(
+                "Dataset contains missing values. Rows with NaNs will be dropped."
+            )
+            dataset = dataset.dropna()
+
+        # Convert to numeric and handle non-numeric data
+        dataset = dataset.apply(pd.to_numeric, errors="coerce")
+
+        # Initialize a list to store DFGLS results
+        dfgls_values = []
+
         for col in dataset.columns:
-            dfgls_out = DFGLS(dataset[col].values)
-            dfgls_values[col] = {
-                "stat": dfgls_out.stat,
-                "pvalue": dfgls_out.pvalue,
-                "usedlag": dfgls_out.lags,
-                "nobs": dfgls_out.nobs,
-            }
-
-        return self.cache_results(dfgls_values)
+            try:
+                dfgls_out = DFGLS(dataset[col].values)
+                dfgls_values.append(
+                    {
+                        "Variable": col,
+                        "stat": dfgls_out.stat,
+                        "pvalue": dfgls_out.pvalue,
+                        "usedlag": dfgls_out.lags,
+                        "nobs": dfgls_out.nobs,
+                    }
+                )
+            except LinAlgError as e:
+                logger.error(
+                    f"SVD did not converge while processing column '{col}'. This could be due to numerical instability or multicollinearity. Error details: {e}"
+                )
+                dfgls_values.append(
+                    {
+                        "Variable": col,
+                        "stat": None,
+                        "pvalue": None,
+                        "usedlag": None,
+                        "nobs": None,
+                        "error": str(e),
+                    }
+                )
+
+        return self.cache_results({"dfgls_results": dfgls_values})
+
+    def summary(self, metric_value):
+        """
+        Build a table for summarizing the DFGLS results
+        """
+        dfgls_results = metric_value["dfgls_results"]
+
+        return ResultSummary(
+            results=[
+                ResultTable(
+                    data=dfgls_results,
+                    metadata=ResultTableMetadata(title="DFGLS Test Results"),
+                )
+            ]
+        )
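
A standalone sketch of the `DFGLS` call the hardened test wraps, including the `LinAlgError` case it now logs instead of raising (toy data assumed):

    import numpy as np
    from arch.unitroot import DFGLS
    from numpy.linalg import LinAlgError

    series = np.random.default_rng(0).normal(size=100).cumsum()  # toy random walk

    try:
        result = DFGLS(series)
        # The test records these four attributes per column
        print(result.stat, result.pvalue, result.lags, result.nobs)
    except LinAlgError as e:
        # Degenerate inputs (constant or near-collinear data) can make the
        # underlying SVD fail to converge
        print(f"SVD did not converge: {e}")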
--- a/validmind/tests/data_validation/HeatmapFeatureCorrelations.py
+++ b/validmind/tests/data_validation/HeatmapFeatureCorrelations.py
@@ -62,7 +62,7 @@ class HeatmapFeatureCorrelations(Metric):
     }
 
     def run(self):
-        features = self.params["features"]
+        features = self.params.get("features")
         declutter = self.params.get("declutter", False)
         fontsize = self.params.get("fontsize", 13)
 
--- a/validmind/tests/data_validation/HighPearsonCorrelation.py
+++ b/validmind/tests/data_validation/HighPearsonCorrelation.py
@@ -65,9 +65,18 @@ class HighPearsonCorrelation(ThresholdTest):
     }
 
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The high pearson correlation test returns results like these:
-        [{"values": {"correlations": [{"column": "NumOfProducts", "correlation": -0.3044645622389459}]}, "column": "Balance", "passed": false}]
+        """The high pearson correlation test returns results like these:
+        [
+            {
+                "values": {
+                    "correlations": [
+                        {"column": "NumOfProducts", "correlation": -0.3044645622389459}
+                    ]
+                },
+                "column": "Balance",
+                "passed": false,
+            }
+        ]
         """
         results_table = [
             {
--- a/validmind/tests/data_validation/IsolationForestOutliers.py
+++ b/validmind/tests/data_validation/IsolationForestOutliers.py
@@ -64,7 +64,7 @@ class IsolationForestOutliers(Metric):
 
     def run(self):
         if self.params["features_columns"] is None:
-            features_list = self.inputs.dataset.feature_columns
+            features_list = self.inputs.dataset.feature_columns_numeric
         else:
             features_list = self.params["features_columns"]
 
@@ -78,7 +78,7 @@ class IsolationForestOutliers(Metric):
                 + "training dataset feature columns"
             )
 
-        dataset = self.inputs.dataset.df
+        dataset = self.inputs.dataset.df[features_list]
 
         # Training with isolation forest algorithm
         clf = IsolationForest(
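
The IsolationForestOutliers fix means the model is now fit only on the selected numeric feature columns rather than the full dataframe. A toy sketch of the corrected behavior (column names are illustrative):

    import pandas as pd
    from sklearn.ensemble import IsolationForest

    df = pd.DataFrame(
        {
            "x": [1.0, 2.0, 3.0, 100.0],
            "y": [2.0, 2.1, 1.9, -50.0],
            "label": ["a", "b", "a", "b"],  # non-numeric, must be excluded
        }
    )
    features_list = ["x", "y"]  # stands in for dataset.feature_columns_numeric

    clf = IsolationForest(random_state=0).fit(df[features_list])
    print(clf.predict(df[features_list]))  # -1 marks outliers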