validmind 2.2.6__py3-none-any.whl → 2.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +2 -1
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +74 -82
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +58 -19
- validmind/client.py +5 -5
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -9
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/__init__.py +7 -7
- validmind/tests/__types__.py +170 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +13 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +34 -59
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +2 -2
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +7 -4
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/METADATA +1 -1
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/RECORD +95 -103
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/LICENSE +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/WHEEL +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/entry_points.txt +0 -0
validmind/tests/__init__.py
CHANGED
@@ -30,7 +30,9 @@ from ..utils import (
|
|
30
30
|
test_id_to_name,
|
31
31
|
)
|
32
32
|
from ..vm_models import TestContext, TestInput
|
33
|
-
from .
|
33
|
+
from .__types__ import TestID
|
34
|
+
from .decorator import tags, tasks
|
35
|
+
from .decorator import test as test_decorator
|
34
36
|
from .test_providers import LocalTestProvider, TestProvider
|
35
37
|
|
36
38
|
logger = get_logger(__name__)
|
@@ -84,7 +86,6 @@ def _pretty_list_tests(tests, truncate=True):
|
|
84
86
|
{
|
85
87
|
"ID": test_id,
|
86
88
|
"Name": test_id_to_name(test_id),
|
87
|
-
"Test Type": __test_classes[test_id].test_type,
|
88
89
|
"Description": _test_description(__test_classes[test_id], truncate),
|
89
90
|
"Required Inputs": __test_classes[test_id].required_inputs,
|
90
91
|
"Params": __test_classes[test_id].default_params or {},
|
@@ -340,7 +341,7 @@ def load_test(test_id: str, reload=False):
|
|
340
341
|
# if its a function, we decorate it and then load the class
|
341
342
|
# TODO: simplify this as we move towards all functional metrics
|
342
343
|
# "_" is used here so it doesn't conflict with other test ids
|
343
|
-
|
344
|
+
test_decorator("_")(test)
|
344
345
|
test = __custom_tests["_"]
|
345
346
|
|
346
347
|
test.test_id = f"{test_id}:{result_id}" if result_id else test_id
|
@@ -348,7 +349,7 @@ def load_test(test_id: str, reload=False):
|
|
348
349
|
return test
|
349
350
|
|
350
351
|
|
351
|
-
def describe_test(test_id:
|
352
|
+
def describe_test(test_id: TestID = None, raw: bool = False, show: bool = True):
|
352
353
|
"""Get or show details about the test
|
353
354
|
|
354
355
|
This function can be used to see test details including the test name, description,
|
@@ -365,7 +366,6 @@ def describe_test(test_id: str = None, raw: bool = False, show: bool = True):
|
|
365
366
|
details = {
|
366
367
|
"ID": test_id,
|
367
368
|
"Name": test_id_to_name(test_id),
|
368
|
-
"Test Type": test.test_type,
|
369
369
|
"Required Inputs": test.required_inputs,
|
370
370
|
"Params": test.default_params or {},
|
371
371
|
"Description": inspect.getdoc(test).strip() or "",
|
@@ -407,7 +407,7 @@ def describe_test(test_id: str = None, raw: bool = False, show: bool = True):
|
|
407
407
|
|
408
408
|
|
409
409
|
def run_test(
|
410
|
-
test_id:
|
410
|
+
test_id: TestID = None,
|
411
411
|
name: str = None,
|
412
412
|
unit_metrics: list = None,
|
413
413
|
params: dict = None,
|
@@ -451,7 +451,7 @@ def run_test(
|
|
451
451
|
|
452
452
|
if unit_metrics:
|
453
453
|
metric_id_name = "".join(word[0].upper() + word[1:] for word in name.split())
|
454
|
-
test_id = f"validmind.
|
454
|
+
test_id = f"validmind.composite_test.{metric_id_name}"
|
455
455
|
|
456
456
|
error, TestClass = load_composite_metric(
|
457
457
|
unit_metrics=unit_metrics, metric_name=metric_id_name
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
"""Literal types for test IDs.
|
6
|
+
|
7
|
+
This module is auto-generated by running `make generate-test-id-types`.
|
8
|
+
Should not be modified manually.
|
9
|
+
"""
|
10
|
+
|
11
|
+
from typing import Literal
|
12
|
+
|
13
|
+
TestID = Literal[
|
14
|
+
"validmind.prompt_validation.Bias",
|
15
|
+
"validmind.prompt_validation.Clarity",
|
16
|
+
"validmind.prompt_validation.Specificity",
|
17
|
+
"validmind.prompt_validation.Robustness",
|
18
|
+
"validmind.prompt_validation.NegativeInstruction",
|
19
|
+
"validmind.prompt_validation.Conciseness",
|
20
|
+
"validmind.prompt_validation.Delimitation",
|
21
|
+
"validmind.model_validation.BertScore",
|
22
|
+
"validmind.model_validation.RegardScore",
|
23
|
+
"validmind.model_validation.BleuScore",
|
24
|
+
"validmind.model_validation.RegressionResidualsPlot",
|
25
|
+
"validmind.model_validation.FeaturesAUC",
|
26
|
+
"validmind.model_validation.ContextualRecall",
|
27
|
+
"validmind.model_validation.MeteorScore",
|
28
|
+
"validmind.model_validation.RougeScore",
|
29
|
+
"validmind.model_validation.ModelMetadata",
|
30
|
+
"validmind.model_validation.ClusterSizeDistribution",
|
31
|
+
"validmind.model_validation.TokenDisparity",
|
32
|
+
"validmind.model_validation.ToxicityScore",
|
33
|
+
"validmind.model_validation.embeddings.CosineSimilarityComparison",
|
34
|
+
"validmind.model_validation.embeddings.EmbeddingsVisualization2D",
|
35
|
+
"validmind.model_validation.embeddings.StabilityAnalysisRandomNoise",
|
36
|
+
"validmind.model_validation.embeddings.TSNEComponentsPairwisePlots",
|
37
|
+
"validmind.model_validation.embeddings.CosineSimilarityDistribution",
|
38
|
+
"validmind.model_validation.embeddings.PCAComponentsPairwisePlots",
|
39
|
+
"validmind.model_validation.embeddings.CosineSimilarityHeatmap",
|
40
|
+
"validmind.model_validation.embeddings.StabilityAnalysisTranslation",
|
41
|
+
"validmind.model_validation.embeddings.EuclideanDistanceComparison",
|
42
|
+
"validmind.model_validation.embeddings.ClusterDistribution",
|
43
|
+
"validmind.model_validation.embeddings.EuclideanDistanceHeatmap",
|
44
|
+
"validmind.model_validation.embeddings.StabilityAnalysis",
|
45
|
+
"validmind.model_validation.embeddings.StabilityAnalysisKeyword",
|
46
|
+
"validmind.model_validation.embeddings.StabilityAnalysisSynonyms",
|
47
|
+
"validmind.model_validation.embeddings.DescriptiveAnalytics",
|
48
|
+
"validmind.model_validation.ragas.ContextEntityRecall",
|
49
|
+
"validmind.model_validation.ragas.Faithfulness",
|
50
|
+
"validmind.model_validation.ragas.AspectCritique",
|
51
|
+
"validmind.model_validation.ragas.AnswerSimilarity",
|
52
|
+
"validmind.model_validation.ragas.AnswerCorrectness",
|
53
|
+
"validmind.model_validation.ragas.ContextRecall",
|
54
|
+
"validmind.model_validation.ragas.ContextRelevancy",
|
55
|
+
"validmind.model_validation.ragas.ContextPrecision",
|
56
|
+
"validmind.model_validation.ragas.AnswerRelevance",
|
57
|
+
"validmind.model_validation.sklearn.RegressionModelsPerformanceComparison",
|
58
|
+
"validmind.model_validation.sklearn.AdjustedMutualInformation",
|
59
|
+
"validmind.model_validation.sklearn.SilhouettePlot",
|
60
|
+
"validmind.model_validation.sklearn.RobustnessDiagnosis",
|
61
|
+
"validmind.model_validation.sklearn.AdjustedRandIndex",
|
62
|
+
"validmind.model_validation.sklearn.SHAPGlobalImportance",
|
63
|
+
"validmind.model_validation.sklearn.ConfusionMatrix",
|
64
|
+
"validmind.model_validation.sklearn.HomogeneityScore",
|
65
|
+
"validmind.model_validation.sklearn.CompletenessScore",
|
66
|
+
"validmind.model_validation.sklearn.OverfitDiagnosis",
|
67
|
+
"validmind.model_validation.sklearn.ClusterPerformanceMetrics",
|
68
|
+
"validmind.model_validation.sklearn.PermutationFeatureImportance",
|
69
|
+
"validmind.model_validation.sklearn.FowlkesMallowsScore",
|
70
|
+
"validmind.model_validation.sklearn.MinimumROCAUCScore",
|
71
|
+
"validmind.model_validation.sklearn.ClusterCosineSimilarity",
|
72
|
+
"validmind.model_validation.sklearn.PrecisionRecallCurve",
|
73
|
+
"validmind.model_validation.sklearn.ClassifierPerformance",
|
74
|
+
"validmind.model_validation.sklearn.VMeasure",
|
75
|
+
"validmind.model_validation.sklearn.MinimumF1Score",
|
76
|
+
"validmind.model_validation.sklearn.ROCCurve",
|
77
|
+
"validmind.model_validation.sklearn.RegressionR2Square",
|
78
|
+
"validmind.model_validation.sklearn.RegressionErrors",
|
79
|
+
"validmind.model_validation.sklearn.ClusterPerformance",
|
80
|
+
"validmind.model_validation.sklearn.TrainingTestDegradation",
|
81
|
+
"validmind.model_validation.sklearn.HyperParametersTuning",
|
82
|
+
"validmind.model_validation.sklearn.KMeansClustersOptimization",
|
83
|
+
"validmind.model_validation.sklearn.ModelsPerformanceComparison",
|
84
|
+
"validmind.model_validation.sklearn.WeakspotsDiagnosis",
|
85
|
+
"validmind.model_validation.sklearn.PopulationStabilityIndex",
|
86
|
+
"validmind.model_validation.sklearn.MinimumAccuracy",
|
87
|
+
"validmind.model_validation.statsmodels.RegressionModelsCoeffs",
|
88
|
+
"validmind.model_validation.statsmodels.BoxPierce",
|
89
|
+
"validmind.model_validation.statsmodels.RegressionCoeffsPlot",
|
90
|
+
"validmind.model_validation.statsmodels.RegressionModelSensitivityPlot",
|
91
|
+
"validmind.model_validation.statsmodels.RegressionModelForecastPlotLevels",
|
92
|
+
"validmind.model_validation.statsmodels.ScorecardHistogram",
|
93
|
+
"validmind.model_validation.statsmodels.LJungBox",
|
94
|
+
"validmind.model_validation.statsmodels.JarqueBera",
|
95
|
+
"validmind.model_validation.statsmodels.KolmogorovSmirnov",
|
96
|
+
"validmind.model_validation.statsmodels.ShapiroWilk",
|
97
|
+
"validmind.model_validation.statsmodels.CumulativePredictionProbabilities",
|
98
|
+
"validmind.model_validation.statsmodels.RegressionFeatureSignificance",
|
99
|
+
"validmind.model_validation.statsmodels.RegressionModelSummary",
|
100
|
+
"validmind.model_validation.statsmodels.Lilliefors",
|
101
|
+
"validmind.model_validation.statsmodels.RunsTest",
|
102
|
+
"validmind.model_validation.statsmodels.RegressionPermutationFeatureImportance",
|
103
|
+
"validmind.model_validation.statsmodels.PredictionProbabilitiesHistogram",
|
104
|
+
"validmind.model_validation.statsmodels.AutoARIMA",
|
105
|
+
"validmind.model_validation.statsmodels.GINITable",
|
106
|
+
"validmind.model_validation.statsmodels.RegressionModelForecastPlot",
|
107
|
+
"validmind.model_validation.statsmodels.DurbinWatsonTest",
|
108
|
+
"validmind.data_validation.MissingValuesRisk",
|
109
|
+
"validmind.data_validation.IQROutliersTable",
|
110
|
+
"validmind.data_validation.BivariateFeaturesBarPlots",
|
111
|
+
"validmind.data_validation.Skewness",
|
112
|
+
"validmind.data_validation.Duplicates",
|
113
|
+
"validmind.data_validation.MissingValuesBarPlot",
|
114
|
+
"validmind.data_validation.DatasetDescription",
|
115
|
+
"validmind.data_validation.ZivotAndrewsArch",
|
116
|
+
"validmind.data_validation.ScatterPlot",
|
117
|
+
"validmind.data_validation.TimeSeriesOutliers",
|
118
|
+
"validmind.data_validation.TabularCategoricalBarPlots",
|
119
|
+
"validmind.data_validation.AutoStationarity",
|
120
|
+
"validmind.data_validation.DescriptiveStatistics",
|
121
|
+
"validmind.data_validation.ANOVAOneWayTable",
|
122
|
+
"validmind.data_validation.TargetRateBarPlots",
|
123
|
+
"validmind.data_validation.PearsonCorrelationMatrix",
|
124
|
+
"validmind.data_validation.FeatureTargetCorrelationPlot",
|
125
|
+
"validmind.data_validation.TabularNumericalHistograms",
|
126
|
+
"validmind.data_validation.IsolationForestOutliers",
|
127
|
+
"validmind.data_validation.ChiSquaredFeaturesTable",
|
128
|
+
"validmind.data_validation.HighCardinality",
|
129
|
+
"validmind.data_validation.MissingValues",
|
130
|
+
"validmind.data_validation.PhillipsPerronArch",
|
131
|
+
"validmind.data_validation.RollingStatsPlot",
|
132
|
+
"validmind.data_validation.TabularDescriptionTables",
|
133
|
+
"validmind.data_validation.AutoMA",
|
134
|
+
"validmind.data_validation.UniqueRows",
|
135
|
+
"validmind.data_validation.TooManyZeroValues",
|
136
|
+
"validmind.data_validation.HighPearsonCorrelation",
|
137
|
+
"validmind.data_validation.ACFandPACFPlot",
|
138
|
+
"validmind.data_validation.BivariateHistograms",
|
139
|
+
"validmind.data_validation.WOEBinTable",
|
140
|
+
"validmind.data_validation.HeatmapFeatureCorrelations",
|
141
|
+
"validmind.data_validation.TimeSeriesFrequency",
|
142
|
+
"validmind.data_validation.DatasetSplit",
|
143
|
+
"validmind.data_validation.SpreadPlot",
|
144
|
+
"validmind.data_validation.TimeSeriesLinePlot",
|
145
|
+
"validmind.data_validation.KPSS",
|
146
|
+
"validmind.data_validation.AutoSeasonality",
|
147
|
+
"validmind.data_validation.BivariateScatterPlots",
|
148
|
+
"validmind.data_validation.EngleGrangerCoint",
|
149
|
+
"validmind.data_validation.TimeSeriesMissingValues",
|
150
|
+
"validmind.data_validation.TimeSeriesHistogram",
|
151
|
+
"validmind.data_validation.LaggedCorrelationHeatmap",
|
152
|
+
"validmind.data_validation.SeasonalDecompose",
|
153
|
+
"validmind.data_validation.WOEBinPlots",
|
154
|
+
"validmind.data_validation.ClassImbalance",
|
155
|
+
"validmind.data_validation.IQROutliersBarPlot",
|
156
|
+
"validmind.data_validation.DFGLSArch",
|
157
|
+
"validmind.data_validation.AutoAR",
|
158
|
+
"validmind.data_validation.TabularDateTimeHistograms",
|
159
|
+
"validmind.data_validation.ADF",
|
160
|
+
"validmind.data_validation.nlp.Toxicity",
|
161
|
+
"validmind.data_validation.nlp.PolarityAndSubjectivity",
|
162
|
+
"validmind.data_validation.nlp.Punctuations",
|
163
|
+
"validmind.data_validation.nlp.Sentiment",
|
164
|
+
"validmind.data_validation.nlp.CommonWords",
|
165
|
+
"validmind.data_validation.nlp.Hashtags",
|
166
|
+
"validmind.data_validation.nlp.LanguageDetection",
|
167
|
+
"validmind.data_validation.nlp.Mentions",
|
168
|
+
"validmind.data_validation.nlp.TextDescription",
|
169
|
+
"validmind.data_validation.nlp.StopWords",
|
170
|
+
]
|
@@ -2,9 +2,9 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
import matplotlib.pyplot as plt
|
6
5
|
import pandas as pd
|
7
|
-
|
6
|
+
import plotly.graph_objects as go
|
7
|
+
from statsmodels.tsa.stattools import acf, pacf
|
8
8
|
|
9
9
|
from validmind.vm_models import Figure, Metric
|
10
10
|
|
@@ -77,37 +77,46 @@ class ACFandPACFPlot(Metric):
|
|
77
77
|
for col in df.columns:
|
78
78
|
series = df[col]
|
79
79
|
|
80
|
-
#
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
80
|
+
# Calculate the maximum number of lags based on the size of the dataset
|
81
|
+
max_lags = min(40, len(series) // 2 - 1)
|
82
|
+
|
83
|
+
# Calculate ACF and PACF values
|
84
|
+
acf_values = acf(series, nlags=max_lags)
|
85
|
+
pacf_values = pacf(series, nlags=max_lags)
|
86
|
+
|
87
|
+
# Create ACF plot using Plotly
|
88
|
+
acf_fig = go.Figure()
|
89
|
+
acf_fig.add_trace(go.Bar(x=list(range(len(acf_values))), y=acf_values))
|
90
|
+
acf_fig.update_layout(
|
91
|
+
title=f"ACF for {col}",
|
92
|
+
xaxis_title="Lag",
|
93
|
+
yaxis_title="ACF",
|
94
|
+
font=dict(size=18),
|
95
|
+
)
|
92
96
|
|
93
|
-
|
94
|
-
|
97
|
+
# Create PACF plot using Plotly
|
98
|
+
pacf_fig = go.Figure()
|
99
|
+
pacf_fig.add_trace(go.Bar(x=list(range(len(pacf_values))), y=pacf_values))
|
100
|
+
pacf_fig.update_layout(
|
101
|
+
title=f"PACF for {col}",
|
102
|
+
xaxis_title="Lag",
|
103
|
+
yaxis_title="PACF",
|
104
|
+
font=dict(size=18),
|
105
|
+
)
|
95
106
|
|
96
|
-
ax1.tick_params(axis="both", labelsize=18)
|
97
|
-
ax2.tick_params(axis="both", labelsize=18)
|
98
|
-
ax1.set_title(f"ACF for {col}", weight="bold", fontsize=20)
|
99
|
-
ax2.set_title(f"PACF for {col}", weight="bold", fontsize=20)
|
100
|
-
ax1.set_xlabel("Lag", fontsize=18)
|
101
|
-
ax2.set_xlabel("Lag", fontsize=18)
|
102
107
|
figures.append(
|
103
108
|
Figure(
|
104
109
|
for_object=self,
|
105
|
-
key=f"{self.key}:{col}",
|
106
|
-
figure=
|
110
|
+
key=f"{self.key}:{col}_acf",
|
111
|
+
figure=acf_fig,
|
112
|
+
)
|
113
|
+
)
|
114
|
+
figures.append(
|
115
|
+
Figure(
|
116
|
+
for_object=self,
|
117
|
+
key=f"{self.key}:{col}_pacf",
|
118
|
+
figure=pacf_fig,
|
107
119
|
)
|
108
120
|
)
|
109
|
-
|
110
|
-
# Do this if you want to prevent the figure from being displayed
|
111
|
-
plt.close("all")
|
112
121
|
|
113
122
|
return self.cache_results(figures=figures)
|
@@ -2,12 +2,18 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from
|
5
|
+
from dataclasses import dataclass
|
6
|
+
|
7
|
+
import pandas as pd
|
6
8
|
from statsmodels.tsa.stattools import adfuller
|
7
9
|
|
10
|
+
from validmind.logging import get_logger
|
8
11
|
from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
|
9
12
|
|
13
|
+
logger = get_logger(__name__)
|
14
|
+
|
10
15
|
|
16
|
+
@dataclass
|
11
17
|
class ADF(Metric):
|
12
18
|
"""
|
13
19
|
Assesses the stationarity of a time series dataset using the Augmented Dickey-Fuller (ADF) test.
|
@@ -53,7 +59,7 @@ class ADF(Metric):
|
|
53
59
|
}
|
54
60
|
|
55
61
|
def summary(self, metric_value: dict):
|
56
|
-
table = DataFrame.from_dict(metric_value, orient="index")
|
62
|
+
table = pd.DataFrame.from_dict(metric_value, orient="index")
|
57
63
|
table = table.reset_index()
|
58
64
|
table.columns = [
|
59
65
|
"Feature",
|
@@ -83,18 +89,41 @@ class ADF(Metric):
|
|
83
89
|
"""
|
84
90
|
dataset = self.inputs.dataset.df
|
85
91
|
|
92
|
+
# Check if the dataset is a time series
|
93
|
+
if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
|
94
|
+
raise ValueError(
|
95
|
+
"Dataset index must be a datetime or period index for time series analysis."
|
96
|
+
)
|
97
|
+
|
98
|
+
# Preprocessing: Drop rows with any NaN values
|
99
|
+
if dataset.isnull().values.any():
|
100
|
+
logger.warning(
|
101
|
+
"Dataset contains missing values. Rows with NaNs will be dropped."
|
102
|
+
)
|
103
|
+
dataset = dataset.dropna()
|
104
|
+
|
86
105
|
adf_values = {}
|
87
106
|
for col in dataset.columns:
|
88
|
-
|
89
|
-
dataset[col].values
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
107
|
+
try:
|
108
|
+
adf_result = adfuller(dataset[col].values)
|
109
|
+
adf_values[col] = {
|
110
|
+
"ADF Statistic": adf_result[0],
|
111
|
+
"P-Value": adf_result[1],
|
112
|
+
"Used Lag": adf_result[2],
|
113
|
+
"Number of Observations": adf_result[3],
|
114
|
+
"Critical Values": adf_result[4],
|
115
|
+
"IC Best": adf_result[5],
|
116
|
+
}
|
117
|
+
except Exception as e:
|
118
|
+
logger.error(f"Error processing column '{col}': {e}")
|
119
|
+
adf_values[col] = {
|
120
|
+
"ADF Statistic": None,
|
121
|
+
"P-Value": None,
|
122
|
+
"Used Lag": None,
|
123
|
+
"Number of Observations": None,
|
124
|
+
"Critical Values": None,
|
125
|
+
"IC Best": None,
|
126
|
+
"Error": str(e),
|
127
|
+
}
|
99
128
|
|
100
129
|
return self.cache_results(adf_values)
|
@@ -2,10 +2,10 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
+
import itertools
|
5
6
|
from dataclasses import dataclass
|
6
7
|
|
7
|
-
import
|
8
|
-
import seaborn as sns
|
8
|
+
import plotly.express as px
|
9
9
|
|
10
10
|
from validmind.vm_models import Figure, Metric
|
11
11
|
|
@@ -23,7 +23,7 @@ class BivariateScatterPlots(Metric):
|
|
23
23
|
biases and irregularities in the data.
|
24
24
|
|
25
25
|
**Test Mechanism**: This metric operates by creating a scatter plot for each pair of the selected features in the
|
26
|
-
dataset. If the parameters "
|
26
|
+
dataset. If the parameters "selected_columns" are not specified, an error will be thrown. The metric offers
|
27
27
|
flexibility by allowing the user to filter on a specific target class - specified by the "target_filter" parameter
|
28
28
|
- for more granified insights. Each scatterplot is then color-coded based on the category of the target variable
|
29
29
|
for better visual differentiation. The seaborn scatterplot library is used for generating the plots.
|
@@ -53,7 +53,7 @@ class BivariateScatterPlots(Metric):
|
|
53
53
|
|
54
54
|
name = "bivariate_scatter_plots"
|
55
55
|
required_inputs = ["dataset"]
|
56
|
-
default_params = {"
|
56
|
+
default_params = {"selected_columns": None}
|
57
57
|
metadata = {
|
58
58
|
"task_types": ["classification"],
|
59
59
|
"tags": [
|
@@ -65,52 +65,49 @@ class BivariateScatterPlots(Metric):
|
|
65
65
|
],
|
66
66
|
}
|
67
67
|
|
68
|
-
def plot_bivariate_scatter(self,
|
69
|
-
status_var = self.inputs.dataset.target_column
|
68
|
+
def plot_bivariate_scatter(self, columns):
|
70
69
|
figures = []
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
70
|
+
df = self.inputs.dataset.df
|
71
|
+
|
72
|
+
# Generate all pairs of columns
|
73
|
+
features_pairs = list(itertools.combinations(columns, 2))
|
74
|
+
|
75
|
+
for x, y in features_pairs:
|
76
|
+
fig = px.scatter(
|
77
|
+
df,
|
78
|
+
x=x,
|
79
|
+
y=y,
|
80
|
+
title=f"{x} and {y}",
|
81
|
+
labels={x: x, y: y},
|
82
|
+
opacity=0.7,
|
83
|
+
color_discrete_sequence=["blue"], # Use the same color for all points
|
83
84
|
)
|
84
|
-
|
85
|
-
# Change legend labels
|
86
|
-
legend_labels = [
|
87
|
-
"Category 1" if t.get_text() == "1" else "Category 2"
|
88
|
-
for t in plot.legend_.texts[1:]
|
89
|
-
]
|
90
|
-
plot.legend_.texts[1:] = legend_labels
|
91
|
-
|
92
|
-
plt.title(x + " and " + y)
|
93
|
-
plt.xlabel(x)
|
94
|
-
plt.ylabel(y)
|
95
|
-
plt.show()
|
85
|
+
fig.update_traces(marker=dict(color="blue"))
|
96
86
|
|
97
87
|
figures.append(
|
98
|
-
Figure(for_object=self, key=f"{self.key}:{x}_{y}", figure=
|
88
|
+
Figure(for_object=self, key=f"{self.key}:{x}_{y}", figure=fig)
|
99
89
|
)
|
100
90
|
|
101
|
-
plt.close("all")
|
102
|
-
|
103
91
|
return figures
|
104
92
|
|
105
93
|
def run(self):
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
94
|
+
selected_columns = self.params["selected_columns"]
|
95
|
+
|
96
|
+
if selected_columns is None:
|
97
|
+
# Use all columns if selected_columns is not provided
|
98
|
+
selected_columns = self.inputs.dataset.df.columns.tolist()
|
99
|
+
else:
|
100
|
+
# Check if all selected columns exist in the dataframe
|
101
|
+
missing_columns = [
|
102
|
+
col
|
103
|
+
for col in selected_columns
|
104
|
+
if col not in self.inputs.dataset.df.columns
|
105
|
+
]
|
106
|
+
if missing_columns:
|
107
|
+
raise ValueError(
|
108
|
+
f"The following selected columns are not in the dataframe: {missing_columns}"
|
109
|
+
)
|
113
110
|
|
114
|
-
figures = self.plot_bivariate_scatter(
|
111
|
+
figures = self.plot_bivariate_scatter(selected_columns)
|
115
112
|
|
116
113
|
return self.cache_results(figures=figures)
|
@@ -4,9 +4,14 @@
|
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
6
|
|
7
|
+
import pandas as pd
|
7
8
|
from arch.unitroot import DFGLS
|
9
|
+
from numpy.linalg import LinAlgError
|
8
10
|
|
9
|
-
from validmind.
|
11
|
+
from validmind.logging import get_logger
|
12
|
+
from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
|
13
|
+
|
14
|
+
logger = get_logger(__name__)
|
10
15
|
|
11
16
|
|
12
17
|
@dataclass
|
@@ -59,14 +64,65 @@ class DFGLSArch(Metric):
|
|
59
64
|
"""
|
60
65
|
dataset = self.inputs.dataset.df
|
61
66
|
|
62
|
-
|
67
|
+
# Check if the dataset is a time series
|
68
|
+
if not isinstance(dataset.index, (pd.DatetimeIndex, pd.PeriodIndex)):
|
69
|
+
raise ValueError(
|
70
|
+
"Dataset index must be a datetime or period index for time series analysis."
|
71
|
+
)
|
72
|
+
|
73
|
+
# Preprocessing: Drop rows with any NaN values
|
74
|
+
if dataset.isnull().values.any():
|
75
|
+
logger.warning(
|
76
|
+
"Dataset contains missing values. Rows with NaNs will be dropped."
|
77
|
+
)
|
78
|
+
dataset = dataset.dropna()
|
79
|
+
|
80
|
+
# Convert to numeric and handle non-numeric data
|
81
|
+
dataset = dataset.apply(pd.to_numeric, errors="coerce")
|
82
|
+
|
83
|
+
# Initialize a list to store DFGLS results
|
84
|
+
dfgls_values = []
|
85
|
+
|
63
86
|
for col in dataset.columns:
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
87
|
+
try:
|
88
|
+
dfgls_out = DFGLS(dataset[col].values)
|
89
|
+
dfgls_values.append(
|
90
|
+
{
|
91
|
+
"Variable": col,
|
92
|
+
"stat": dfgls_out.stat,
|
93
|
+
"pvalue": dfgls_out.pvalue,
|
94
|
+
"usedlag": dfgls_out.lags,
|
95
|
+
"nobs": dfgls_out.nobs,
|
96
|
+
}
|
97
|
+
)
|
98
|
+
except LinAlgError as e:
|
99
|
+
logger.error(
|
100
|
+
f"SVD did not converge while processing column '{col}'. This could be due to numerical instability or multicollinearity. Error details: {e}"
|
101
|
+
)
|
102
|
+
dfgls_values.append(
|
103
|
+
{
|
104
|
+
"Variable": col,
|
105
|
+
"stat": None,
|
106
|
+
"pvalue": None,
|
107
|
+
"usedlag": None,
|
108
|
+
"nobs": None,
|
109
|
+
"error": str(e),
|
110
|
+
}
|
111
|
+
)
|
112
|
+
|
113
|
+
return self.cache_results({"dfgls_results": dfgls_values})
|
114
|
+
|
115
|
+
def summary(self, metric_value):
|
116
|
+
"""
|
117
|
+
Build a table for summarizing the DFGLS results
|
118
|
+
"""
|
119
|
+
dfgls_results = metric_value["dfgls_results"]
|
120
|
+
|
121
|
+
return ResultSummary(
|
122
|
+
results=[
|
123
|
+
ResultTable(
|
124
|
+
data=dfgls_results,
|
125
|
+
metadata=ResultTableMetadata(title="DFGLS Test Results"),
|
126
|
+
)
|
127
|
+
]
|
128
|
+
)
|
@@ -65,9 +65,18 @@ class HighPearsonCorrelation(ThresholdTest):
|
|
65
65
|
}
|
66
66
|
|
67
67
|
def summary(self, results: List[ThresholdTestResult], all_passed: bool):
|
68
|
-
"""
|
69
|
-
|
70
|
-
|
68
|
+
"""The high pearson correlation test returns results like these:
|
69
|
+
[
|
70
|
+
{
|
71
|
+
"values": {
|
72
|
+
"correlations": [
|
73
|
+
{"column": "NumOfProducts", "correlation": -0.3044645622389459}
|
74
|
+
]
|
75
|
+
},
|
76
|
+
"column": "Balance",
|
77
|
+
"passed": false,
|
78
|
+
}
|
79
|
+
]
|
71
80
|
"""
|
72
81
|
results_table = [
|
73
82
|
{
|
@@ -64,7 +64,7 @@ class IsolationForestOutliers(Metric):
|
|
64
64
|
|
65
65
|
def run(self):
|
66
66
|
if self.params["features_columns"] is None:
|
67
|
-
features_list = self.inputs.dataset.
|
67
|
+
features_list = self.inputs.dataset.feature_columns_numeric
|
68
68
|
else:
|
69
69
|
features_list = self.params["features_columns"]
|
70
70
|
|
@@ -78,7 +78,7 @@ class IsolationForestOutliers(Metric):
|
|
78
78
|
+ "training dataset feature columns"
|
79
79
|
)
|
80
80
|
|
81
|
-
dataset = self.inputs.dataset.df
|
81
|
+
dataset = self.inputs.dataset.df[features_list]
|
82
82
|
|
83
83
|
# Training with isolation forest algorithm
|
84
84
|
clf = IsolationForest(
|