validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
@@ -5,10 +5,14 @@
|
|
5
5
|
import pandas as pd
|
6
6
|
import plotly.graph_objects as go
|
7
7
|
|
8
|
-
from validmind
|
8
|
+
from validmind import tags, tasks
|
9
|
+
from validmind.errors import SkipTestError
|
10
|
+
from validmind.vm_models import VMDataset
|
9
11
|
|
10
12
|
|
11
|
-
|
13
|
+
@tags("time_series_data", "visualization")
|
14
|
+
@tasks("regression")
|
15
|
+
def TimeSeriesLinePlot(dataset: VMDataset):
|
12
16
|
"""
|
13
17
|
Generates and analyses time-series data through line plots revealing trends, patterns, anomalies over time.
|
14
18
|
|
@@ -51,49 +55,27 @@ class TimeSeriesLinePlot(Metric):
|
|
51
55
|
- The metric has an inherent limitation in that it cannot extract deeper statistical insights from the time series
|
52
56
|
data, which can limit its efficacy with complex data structures and phenomena.
|
53
57
|
"""
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
# Creating the figure using Plotly
|
74
|
-
fig = go.Figure()
|
75
|
-
|
76
|
-
fig.add_trace(go.Scatter(x=df.index, y=df[col], mode="lines", name=col))
|
77
|
-
|
78
|
-
fig.update_layout(
|
79
|
-
title={
|
80
|
-
"text": f"{col}",
|
81
|
-
"y": 0.95,
|
82
|
-
"x": 0.5,
|
83
|
-
"xanchor": "center",
|
84
|
-
"yanchor": "top",
|
85
|
-
},
|
86
|
-
font=dict(size=16),
|
87
|
-
)
|
88
|
-
|
89
|
-
figures.append(
|
90
|
-
Figure(
|
91
|
-
for_object=self,
|
92
|
-
key=f"{self.key}:{col}",
|
93
|
-
figure=fig,
|
94
|
-
)
|
95
|
-
)
|
96
|
-
|
97
|
-
return self.cache_results(
|
98
|
-
figures=figures,
|
58
|
+
df = dataset.df
|
59
|
+
|
60
|
+
if not pd.api.types.is_datetime64_any_dtype(df.index):
|
61
|
+
raise SkipTestError("Index must be a datetime type")
|
62
|
+
|
63
|
+
figures = []
|
64
|
+
|
65
|
+
for col in dataset.feature_columns_numeric:
|
66
|
+
fig = go.Figure()
|
67
|
+
fig.add_trace(go.Scatter(x=df.index, y=df[col], mode="lines", name=col))
|
68
|
+
fig.update_layout(
|
69
|
+
title={
|
70
|
+
"text": col,
|
71
|
+
"y": 0.95,
|
72
|
+
"x": 0.5,
|
73
|
+
"xanchor": "center",
|
74
|
+
"yanchor": "top",
|
75
|
+
},
|
76
|
+
font=dict(size=16),
|
99
77
|
)
|
78
|
+
|
79
|
+
figures.append(fig)
|
80
|
+
|
81
|
+
return tuple(figures)
|
@@ -2,24 +2,18 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from dataclasses import dataclass
|
6
|
-
|
7
5
|
import pandas as pd
|
8
6
|
import plotly.express as px
|
9
7
|
import plotly.figure_factory as ff
|
10
8
|
|
11
|
-
from validmind
|
12
|
-
|
13
|
-
|
14
|
-
ResultTable,
|
15
|
-
ResultTableMetadata,
|
16
|
-
ThresholdTest,
|
17
|
-
ThresholdTestResult,
|
18
|
-
)
|
9
|
+
from validmind import tags, tasks
|
10
|
+
from validmind.errors import SkipTestError
|
11
|
+
from validmind.vm_models import VMDataset
|
19
12
|
|
20
13
|
|
21
|
-
@
|
22
|
-
|
14
|
+
@tags("time_series_data")
|
15
|
+
@tasks("regression")
|
16
|
+
def TimeSeriesMissingValues(dataset: VMDataset, min_threshold: int = 1):
|
23
17
|
"""
|
24
18
|
Validates time-series data quality by confirming the count of missing values is below a certain threshold.
|
25
19
|
|
@@ -37,17 +31,11 @@ class TimeSeriesMissingValues(ThresholdTest):
|
|
37
31
|
dataset. An object for the test result is created stating whether the number of missing values is within the
|
38
32
|
specified threshold. Additionally, the test calculates the percentage of missing values alongside the raw count.
|
39
33
|
|
40
|
-
To aid in data visualization, the test generates two plots - a bar plot and a heatmap - to better illustrate the
|
41
|
-
distribution and quantity of missing values per variable. The test results, including a count of missing values,
|
42
|
-
the percentage of missing values, and a pass/fail status, are returned in a results table.
|
43
|
-
|
44
34
|
### Signs of High Risk
|
45
35
|
|
46
36
|
- The number of missing values in any column of the dataset surpasses the threshold, marking a failure and a
|
47
37
|
high-risk scenario. The reasons could range from incomplete data collection, faulty sensors to data preprocessing
|
48
38
|
errors.
|
49
|
-
- A continuous visual 'streak' in the heatmap may indicate a systematic error during data collection, pointing
|
50
|
-
towards another potential risk source.
|
51
39
|
|
52
40
|
### Strengths
|
53
41
|
|
@@ -55,7 +43,6 @@ class TimeSeriesMissingValues(ThresholdTest):
|
|
55
43
|
- Applicable and customizable through the threshold parameter across different data sets.
|
56
44
|
- Goes beyond raw numbers by calculating the percentage of missing values, offering a more relative understanding
|
57
45
|
of data scarcity.
|
58
|
-
- Includes a robust visualization mechanism for easy and fast understanding of data quality.
|
59
46
|
|
60
47
|
### Limitations
|
61
48
|
|
@@ -66,124 +53,61 @@ class TimeSeriesMissingValues(ThresholdTest):
|
|
66
53
|
overlook problematic data if set too loosely.
|
67
54
|
- Solely focuses on the 'missingness' of the data and might fall short in addressing other aspects of data quality.
|
68
55
|
"""
|
56
|
+
df = dataset.df
|
57
|
+
|
58
|
+
if not pd.api.types.is_datetime64_any_dtype(df.index):
|
59
|
+
raise SkipTestError("Dataset must be provided with datetime index")
|
69
60
|
|
70
|
-
|
71
|
-
required_inputs = ["dataset"]
|
72
|
-
default_params = {"min_threshold": 1}
|
73
|
-
tasks = ["regression"]
|
74
|
-
tags = ["time_series_data"]
|
61
|
+
missing = df.isna().sum()
|
75
62
|
|
76
|
-
|
77
|
-
|
63
|
+
if sum(missing.values) == 0:
|
64
|
+
# if theres no missing values, no need to plot anything
|
65
|
+
return [
|
66
|
+
{
|
67
|
+
"Column": col,
|
68
|
+
"Number of Missing Values": missing[col],
|
69
|
+
"Percentage of Missing Values (%)": 0,
|
70
|
+
"Pass/Fail": "Pass",
|
71
|
+
}
|
72
|
+
for col in missing.index
|
73
|
+
], True
|
74
|
+
|
75
|
+
barplot = px.bar(
|
76
|
+
missing,
|
77
|
+
x=missing.index,
|
78
|
+
y=missing.values,
|
79
|
+
labels={"x": "", "y": "Missing Values"},
|
80
|
+
title="Total Number of Missing Values per Variable",
|
81
|
+
color=missing.values,
|
82
|
+
color_continuous_scale="Reds",
|
83
|
+
)
|
84
|
+
|
85
|
+
missing_mask = df.isnull()
|
86
|
+
z = missing_mask.T.astype(int).values
|
87
|
+
x = missing_mask.index.tolist()
|
88
|
+
y = missing_mask.columns.tolist()
|
89
|
+
heatmap = ff.create_annotated_heatmap(
|
90
|
+
z=z,
|
91
|
+
x=x,
|
92
|
+
y=y,
|
93
|
+
colorscale="Reds",
|
94
|
+
showscale=False,
|
95
|
+
)
|
96
|
+
|
97
|
+
# Update the layout after creation
|
98
|
+
heatmap.update_layout(title="Missing Values Heatmap")
|
99
|
+
|
100
|
+
return (
|
101
|
+
[
|
78
102
|
{
|
79
|
-
"Column":
|
80
|
-
"Number of Missing Values":
|
81
|
-
"Percentage of Missing Values (%)":
|
82
|
-
"Pass/Fail": "Pass" if
|
103
|
+
"Column": col,
|
104
|
+
"Number of Missing Values": missing[col],
|
105
|
+
"Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
|
106
|
+
"Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
|
83
107
|
}
|
84
|
-
for result in results
|
85
|
-
]
|
86
|
-
return ResultSummary(
|
87
|
-
results=[
|
88
|
-
ResultTable(
|
89
|
-
data=results_table,
|
90
|
-
metadata=ResultTableMetadata(
|
91
|
-
title="Missing Values Results for Dataset"
|
92
|
-
),
|
93
|
-
)
|
94
|
-
]
|
95
|
-
)
|
96
|
-
|
97
|
-
def run(self):
|
98
|
-
df = self.inputs.dataset.df
|
99
|
-
|
100
|
-
# Check if the index of dataframe is datetime
|
101
|
-
is_datetime = pd.api.types.is_datetime64_any_dtype(df.index)
|
102
|
-
if not is_datetime:
|
103
|
-
raise ValueError("Dataset must be provided with datetime index")
|
104
|
-
|
105
|
-
# Validate threshold parameter
|
106
|
-
if "min_threshold" not in self.params:
|
107
|
-
raise ValueError("min_threshold must be provided in params")
|
108
|
-
min_threshold = self.params["min_threshold"]
|
109
|
-
|
110
|
-
rows = df.shape[0]
|
111
|
-
missing = df.isna().sum()
|
112
|
-
test_results = [
|
113
|
-
ThresholdTestResult(
|
114
|
-
column=col,
|
115
|
-
passed=missing[col] < min_threshold,
|
116
|
-
values={"n_missing": missing[col], "p_missing": missing[col] / rows},
|
117
|
-
)
|
118
108
|
for col in missing.index
|
119
|
-
]
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
if fig_barplot is not None:
|
125
|
-
test_figures.append(
|
126
|
-
Figure(
|
127
|
-
for_object=self,
|
128
|
-
key=f"{self.name}:barplot",
|
129
|
-
figure=fig_barplot,
|
130
|
-
metadata={"type": "barplot"},
|
131
|
-
)
|
132
|
-
)
|
133
|
-
test_figures.append(
|
134
|
-
Figure(
|
135
|
-
for_object=self,
|
136
|
-
key=f"{self.name}:heatmap",
|
137
|
-
figure=fig_heatmap,
|
138
|
-
metadata={"type": "heatmap"},
|
139
|
-
)
|
140
|
-
)
|
141
|
-
|
142
|
-
return self.cache_results(
|
143
|
-
test_results,
|
144
|
-
passed=all([r.passed for r in test_results]),
|
145
|
-
# Don't pass figures until we figure out how to group metric-figures for multiple
|
146
|
-
# executions inside a single test run
|
147
|
-
# figures=test_figures,
|
148
|
-
)
|
149
|
-
|
150
|
-
def _barplot(self, df):
|
151
|
-
"""
|
152
|
-
Generate a bar plot of missing values using Plotly.
|
153
|
-
"""
|
154
|
-
missing_values = df.isnull().sum()
|
155
|
-
if sum(missing_values.values) != 0:
|
156
|
-
fig = px.bar(
|
157
|
-
missing_values,
|
158
|
-
x=missing_values.index,
|
159
|
-
y=missing_values.values,
|
160
|
-
labels={"x": "", "y": "Missing Values"},
|
161
|
-
title="Total Number of Missing Values per Variable",
|
162
|
-
color=missing_values.values,
|
163
|
-
color_continuous_scale="Reds",
|
164
|
-
)
|
165
|
-
else:
|
166
|
-
fig = None
|
167
|
-
|
168
|
-
return fig
|
169
|
-
|
170
|
-
def _heatmap(self, df):
|
171
|
-
"""
|
172
|
-
Plots a heatmap to visualize missing values using Plotly.
|
173
|
-
"""
|
174
|
-
# Create a boolean mask for missing values
|
175
|
-
missing_mask = df.isnull()
|
176
|
-
z = missing_mask.T.astype(int).values # Convert boolean to int for heatmap
|
177
|
-
|
178
|
-
x = missing_mask.index.tolist()
|
179
|
-
y = missing_mask.columns.tolist()
|
180
|
-
|
181
|
-
if not x:
|
182
|
-
fig = ff.create_annotated_heatmap(
|
183
|
-
z=z, x=x, y=y, colorscale="Reds", showscale=False
|
184
|
-
)
|
185
|
-
fig.update_layout(title="Missing Values Heatmap")
|
186
|
-
else:
|
187
|
-
fig = None
|
188
|
-
|
189
|
-
return fig
|
109
|
+
],
|
110
|
+
barplot,
|
111
|
+
heatmap,
|
112
|
+
all(missing[col] < min_threshold for col in missing.index),
|
113
|
+
)
|
@@ -2,23 +2,17 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from dataclasses import dataclass
|
6
|
-
|
7
5
|
import pandas as pd
|
8
6
|
import plotly.graph_objects as go
|
9
7
|
|
10
|
-
from validmind
|
11
|
-
|
12
|
-
|
13
|
-
ResultTable,
|
14
|
-
ResultTableMetadata,
|
15
|
-
ThresholdTest,
|
16
|
-
ThresholdTestResult,
|
17
|
-
)
|
8
|
+
from validmind import tags, tasks
|
9
|
+
from validmind.errors import SkipTestError
|
10
|
+
from validmind.vm_models import VMDataset
|
18
11
|
|
19
12
|
|
20
|
-
@
|
21
|
-
|
13
|
+
@tags("time_series_data")
|
14
|
+
@tasks("regression")
|
15
|
+
def TimeSeriesOutliers(dataset: VMDataset, zscore_threshold: int = 3):
|
22
16
|
"""
|
23
17
|
Identifies and visualizes outliers in time-series data using the z-score method.
|
24
18
|
|
@@ -62,174 +56,63 @@ class TimeSeriesOutliers(ThresholdTest):
|
|
62
56
|
- It does not address possible ways to handle identified outliers in the data.
|
63
57
|
- The requirement for a datetime index could limit its application.
|
64
58
|
"""
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
variables = first_result.values["Variable"]
|
92
|
-
zScores = first_result.values["z-score"]
|
93
|
-
dates = first_result.values["Date"]
|
94
|
-
passFail = [
|
95
|
-
"Pass" if abs(z) < self.params["zscore_threshold"] else "Fail"
|
96
|
-
for z in zScores
|
97
|
-
]
|
98
|
-
|
99
|
-
return ResultSummary(
|
100
|
-
results=[
|
101
|
-
ResultTable(
|
102
|
-
# Sort by variable and then by date
|
103
|
-
data=pd.DataFrame(
|
104
|
-
{
|
105
|
-
"Variable": variables,
|
106
|
-
"Date": dates,
|
107
|
-
"z-Score": zScores,
|
108
|
-
"Pass/Fail": passFail,
|
109
|
-
}
|
110
|
-
).sort_values(["Variable", "Date"]),
|
111
|
-
metadata=ResultTableMetadata(
|
112
|
-
title="Outliers Results with z-Score Test"
|
113
|
-
),
|
59
|
+
df = dataset.df
|
60
|
+
|
61
|
+
if not pd.api.types.is_datetime64_any_dtype(df.index):
|
62
|
+
raise SkipTestError("Dataset must be provided with datetime index")
|
63
|
+
|
64
|
+
df_numeric = df[dataset.feature_columns_numeric]
|
65
|
+
z_scores = pd.DataFrame(
|
66
|
+
data=df_numeric.apply(lambda x: (x - x.mean()) / x.std()),
|
67
|
+
index=df.index,
|
68
|
+
columns=dataset.feature_columns_numeric,
|
69
|
+
)
|
70
|
+
|
71
|
+
outlier_table = []
|
72
|
+
outliers = z_scores[(z_scores.abs() > zscore_threshold).any(axis=1)]
|
73
|
+
|
74
|
+
for idx, row in outliers.iterrows():
|
75
|
+
for col in dataset.feature_columns_numeric:
|
76
|
+
if abs(row[col]) > zscore_threshold:
|
77
|
+
outlier_table.append(
|
78
|
+
{
|
79
|
+
"Column": col,
|
80
|
+
"Z-Score": row[col],
|
81
|
+
"Threshold": zscore_threshold,
|
82
|
+
"Date": idx.strftime("%Y-%m-%d"),
|
83
|
+
"Pass/Fail": "Fail",
|
84
|
+
}
|
114
85
|
)
|
115
|
-
]
|
116
|
-
)
|
117
|
-
|
118
|
-
def run(self):
|
119
|
-
# Initialize the test_results list
|
120
|
-
test_results = []
|
121
|
-
|
122
|
-
# Check if the index of dataframe is datetime
|
123
|
-
is_datetime = pd.api.types.is_datetime64_any_dtype(self.inputs.dataset.df.index)
|
124
|
-
if not is_datetime:
|
125
|
-
raise ValueError("Dataset must be provided with datetime index")
|
126
86
|
|
127
|
-
|
128
|
-
|
129
|
-
raise ValueError("zscore_threshold must be provided in params")
|
130
|
-
zscore_threshold = self.params["zscore_threshold"]
|
87
|
+
outlier_df = pd.DataFrame(outlier_table)
|
88
|
+
figures = []
|
131
89
|
|
132
|
-
|
133
|
-
|
90
|
+
for column in outlier_df["Column"].unique():
|
91
|
+
fig = go.Figure()
|
134
92
|
|
135
|
-
|
136
|
-
|
137
|
-
include=["number"]
|
138
|
-
).columns.tolist()
|
139
|
-
|
140
|
-
outliers_table = self.identify_outliers(
|
141
|
-
temp_df[num_features_columns], zscore_threshold
|
93
|
+
fig.add_trace(
|
94
|
+
go.Scatter(x=df.index, y=df[column], mode="lines", name="Time Series")
|
142
95
|
)
|
143
96
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
test_name="outliers",
|
153
|
-
passed=passed,
|
154
|
-
values=outliers_table.to_dict(orient="list"),
|
97
|
+
column_outliers = outlier_df[outlier_df["Column"] == column]
|
98
|
+
fig.add_trace(
|
99
|
+
go.Scatter(
|
100
|
+
x=pd.to_datetime(column_outliers["Date"]),
|
101
|
+
y=df.loc[pd.to_datetime(column_outliers["Date"]), column],
|
102
|
+
mode="markers",
|
103
|
+
marker=dict(color="red", size=10),
|
104
|
+
name="Outliers",
|
155
105
|
)
|
156
106
|
)
|
157
107
|
|
158
|
-
|
159
|
-
|
160
|
-
def z_score_with_na(self, df):
|
161
|
-
return df.apply(
|
162
|
-
lambda x: (x - x.mean()) / x.std() if x.dtype.kind in "biufc" else x
|
108
|
+
fig.update_layout(
|
109
|
+
title=f"Outliers for {column}", xaxis_title="Date", yaxis_title=column
|
163
110
|
)
|
164
111
|
|
165
|
-
|
166
|
-
"""
|
167
|
-
Identifies and returns outliers in a pandas DataFrame using the z-score method.
|
168
|
-
Args:
|
169
|
-
df (pandas.DataFrame): A pandas DataFrame containing the data to be analyzed.
|
170
|
-
threshold (float): The absolute value of the z-score above which a value is considered an outlier.
|
171
|
-
Returns:
|
172
|
-
pandas.DataFrame: A DataFrame containing the variables, z-scores, threshold, and dates of the identified outliers.
|
173
|
-
"""
|
174
|
-
z_scores = pd.DataFrame(
|
175
|
-
self.z_score_with_na(df), index=df.index, columns=df.columns
|
176
|
-
)
|
177
|
-
|
178
|
-
outliers = z_scores[(z_scores.abs() > threshold).any(axis=1)]
|
179
|
-
outlier_table = []
|
180
|
-
for idx, row in outliers.iterrows():
|
181
|
-
for col in df.columns:
|
182
|
-
if abs(row[col]) > threshold:
|
183
|
-
outlier_table.append(
|
184
|
-
{
|
185
|
-
"Variable": col,
|
186
|
-
"z-score": row[col],
|
187
|
-
"Threshold": threshold,
|
188
|
-
"Date": idx,
|
189
|
-
}
|
190
|
-
)
|
191
|
-
return pd.DataFrame(outlier_table)
|
192
|
-
|
193
|
-
def _plot_outliers(self, df, outliers_table):
|
194
|
-
"""
|
195
|
-
Plots time series with identified outliers.
|
196
|
-
Args:
|
197
|
-
df (pandas.DataFrame): Input data with time series.
|
198
|
-
outliers_table (pandas.DataFrame): DataFrame with identified outliers.
|
199
|
-
Returns:
|
200
|
-
list: A list of Figure objects with subplots for each variable.
|
201
|
-
"""
|
202
|
-
figures = []
|
203
|
-
|
204
|
-
for col in df.columns:
|
205
|
-
fig = go.Figure()
|
206
|
-
|
207
|
-
fig.add_trace(go.Scatter(x=df.index, y=df[col], mode="lines", name=col))
|
208
|
-
|
209
|
-
if not outliers_table.empty:
|
210
|
-
variable_outliers = outliers_table[outliers_table["Variable"] == col]
|
211
|
-
fig.add_trace(
|
212
|
-
go.Scatter(
|
213
|
-
x=variable_outliers["Date"],
|
214
|
-
y=df.loc[variable_outliers["Date"], col],
|
215
|
-
mode="markers",
|
216
|
-
marker=dict(color="red", size=10),
|
217
|
-
name="Outlier",
|
218
|
-
)
|
219
|
-
)
|
220
|
-
|
221
|
-
fig.update_layout(
|
222
|
-
title=f"Outliers for {col}",
|
223
|
-
xaxis_title="Date",
|
224
|
-
yaxis_title=col,
|
225
|
-
)
|
226
|
-
|
227
|
-
figures.append(
|
228
|
-
Figure(
|
229
|
-
for_object=self,
|
230
|
-
key=f"{self.name}:{col}_{self.inputs.dataset.input_id}",
|
231
|
-
figure=fig,
|
232
|
-
)
|
233
|
-
)
|
112
|
+
figures.append(fig)
|
234
113
|
|
235
|
-
|
114
|
+
return (
|
115
|
+
outlier_df.sort_values(["Column", "Date"]),
|
116
|
+
figures,
|
117
|
+
len(outlier_df) == 0,
|
118
|
+
)
|