validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
@@ -5,10 +5,14 @@
|
|
5
5
|
import pandas as pd
|
6
6
|
import plotly.graph_objects as go
|
7
7
|
|
8
|
-
from validmind
|
8
|
+
from validmind import tags, tasks
|
9
|
+
from validmind.errors import SkipTestError
|
10
|
+
from validmind.vm_models import VMDataset
|
9
11
|
|
10
12
|
|
11
|
-
|
13
|
+
@tags("time_series_data", "visualization")
|
14
|
+
@tasks("classification", "regression")
|
15
|
+
def TabularDateTimeHistograms(dataset: VMDataset):
|
12
16
|
"""
|
13
17
|
Generates histograms to provide graphical insight into the distribution of time intervals in a model's datetime
|
14
18
|
data.
|
@@ -52,46 +56,20 @@ class TabularDateTimeHistograms(Metric):
|
|
52
56
|
- The test is only applicable to datasets containing datetime columns and will fail if such columns are unavailable.
|
53
57
|
- The interpretation of the histograms relies heavily on the domain expertise and experience of the reviewer.
|
54
58
|
"""
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
date_diffs = df.index.to_series().sort_values().diff().dt.days.dropna()
|
73
|
-
|
74
|
-
# Filter out 0 values
|
75
|
-
date_diffs = date_diffs[date_diffs != 0]
|
76
|
-
|
77
|
-
# Create a histogram using Plotly
|
78
|
-
fig = go.Figure()
|
79
|
-
fig.add_trace(go.Histogram(x=date_diffs, nbinsx=30))
|
80
|
-
fig.update_layout(
|
81
|
-
title="Index",
|
82
|
-
xaxis_title="Days Between Consecutive Dates",
|
83
|
-
yaxis_title="Frequency",
|
84
|
-
font=dict(size=18),
|
85
|
-
)
|
86
|
-
|
87
|
-
figures.append(
|
88
|
-
Figure(
|
89
|
-
for_object=self,
|
90
|
-
key=f"{self.key}:index",
|
91
|
-
figure=fig,
|
92
|
-
)
|
93
|
-
)
|
94
|
-
|
95
|
-
return self.cache_results(
|
96
|
-
figures=figures,
|
97
|
-
)
|
59
|
+
df = dataset.df
|
60
|
+
if not isinstance(df.index, (pd.DatetimeIndex, pd.PeriodIndex)):
|
61
|
+
raise SkipTestError("Index must be a datetime type")
|
62
|
+
|
63
|
+
date_diffs = df.index.to_series().sort_values().diff().dt.days.dropna()
|
64
|
+
date_diffs = date_diffs[date_diffs != 0]
|
65
|
+
|
66
|
+
fig = go.Figure()
|
67
|
+
fig.add_trace(go.Histogram(x=date_diffs, nbinsx=30))
|
68
|
+
fig.update_layout(
|
69
|
+
title="Index",
|
70
|
+
xaxis_title="Days Between Consecutive Dates",
|
71
|
+
yaxis_title="Frequency",
|
72
|
+
font=dict(size=18),
|
73
|
+
)
|
74
|
+
|
75
|
+
return fig
|
@@ -2,13 +2,15 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
import numpy as np
|
6
5
|
import plotly.graph_objs as go
|
7
6
|
|
8
|
-
from validmind
|
7
|
+
from validmind import tags, tasks
|
8
|
+
from validmind.vm_models import VMDataset
|
9
9
|
|
10
10
|
|
11
|
-
|
11
|
+
@tags("tabular_data", "visualization")
|
12
|
+
@tasks("classification", "regression")
|
13
|
+
def TabularNumericalHistograms(dataset: VMDataset):
|
12
14
|
"""
|
13
15
|
Generates histograms for each numerical feature in a dataset to provide visual insights into data distribution and
|
14
16
|
detect potential issues.
|
@@ -51,47 +53,26 @@ class TabularNumericalHistograms(Metric):
|
|
51
53
|
- Does not provide any insight into how these features affect the output of the model; it is purely an input
|
52
54
|
analysis tool.
|
53
55
|
"""
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
df =
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
fig.add_trace(
|
74
|
-
go.Histogram(x=df[col], nbinsx=50, name=col)
|
75
|
-
) # add histogram trace
|
76
|
-
fig.update_layout(
|
77
|
-
title_text=f"{col}", # title of plot
|
78
|
-
xaxis_title_text="", # xaxis label
|
79
|
-
yaxis_title_text="", # yaxis label
|
80
|
-
bargap=0.2, # gap between bars of adjacent location coordinates
|
81
|
-
bargroupgap=0.1, # gap between bars of the same location coordinates
|
82
|
-
autosize=False,
|
83
|
-
width=500,
|
84
|
-
height=500,
|
85
|
-
margin=dict(l=50, r=50, b=100, t=100, pad=4),
|
86
|
-
)
|
87
|
-
figures.append(
|
88
|
-
Figure(
|
89
|
-
for_object=self,
|
90
|
-
key=f"{self.key}:{col}",
|
91
|
-
figure=fig,
|
92
|
-
)
|
93
|
-
)
|
94
|
-
|
95
|
-
return self.cache_results(
|
96
|
-
figures=figures,
|
56
|
+
if len(dataset.feature_columns_numeric) == 0:
|
57
|
+
raise ValueError("No numerical columns found in the dataset")
|
58
|
+
|
59
|
+
df = dataset.df
|
60
|
+
figures = []
|
61
|
+
|
62
|
+
for col in dataset.feature_columns_numeric:
|
63
|
+
fig = go.Figure()
|
64
|
+
fig.add_trace(go.Histogram(x=df[col], nbinsx=50, name=col))
|
65
|
+
fig.update_layout(
|
66
|
+
title_text=f"{col}",
|
67
|
+
xaxis_title_text="",
|
68
|
+
yaxis_title_text="",
|
69
|
+
bargap=0.2,
|
70
|
+
bargroupgap=0.1,
|
71
|
+
autosize=False,
|
72
|
+
width=500,
|
73
|
+
height=500,
|
74
|
+
margin=dict(l=50, r=50, b=100, t=100, pad=4),
|
97
75
|
)
|
76
|
+
figures.append(fig)
|
77
|
+
|
78
|
+
return tuple(figures)
|
@@ -2,13 +2,18 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
+
import numpy as np
|
5
6
|
import plotly.graph_objs as go
|
6
7
|
from plotly.subplots import make_subplots
|
7
8
|
|
8
|
-
from validmind
|
9
|
+
from validmind import tags, tasks
|
10
|
+
from validmind.errors import SkipTestError
|
11
|
+
from validmind.vm_models import VMDataset
|
9
12
|
|
10
13
|
|
11
|
-
|
14
|
+
@tags("tabular_data", "visualization", "categorical_data")
|
15
|
+
@tasks("classification")
|
16
|
+
def TargetRateBarPlots(dataset: VMDataset):
|
12
17
|
"""
|
13
18
|
Generates bar plots visualizing the default rates of categorical features for a classification machine learning
|
14
19
|
model.
|
@@ -43,107 +48,63 @@ class TargetRateBarPlots(Metric):
|
|
43
48
|
|
44
49
|
### Limitations
|
45
50
|
|
46
|
-
- The test is less useful when dealing with numeric or continuous data, as it's designed specifically for
|
47
|
-
categorical features.
|
48
|
-
- If the model in question is dealing with a multi-class problem rather than binary classification, the test's
|
49
|
-
assumption of binary target values (0s and 1s) becomes a significant limitation.
|
50
51
|
- The readability of the bar plots drops as the number of distinct categories increases in the dataset, which can
|
51
52
|
make them harder to understand and less useful.
|
52
53
|
"""
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
tasks = ["classification"]
|
58
|
-
tags = ["tabular_data", "visualization", "categorical_data"]
|
59
|
-
|
60
|
-
def plot_loan_default_ratio(self, default_column, columns=None):
|
61
|
-
df = self.inputs.dataset.df
|
62
|
-
|
63
|
-
# Use all categorical features if columns is not specified, else use selected columns
|
64
|
-
if columns is None:
|
65
|
-
features = self.inputs.dataset.feature_columns_categorical
|
66
|
-
else:
|
67
|
-
features = columns
|
68
|
-
|
69
|
-
figures = []
|
70
|
-
for feature in features:
|
71
|
-
fig = make_subplots(
|
72
|
-
rows=1,
|
73
|
-
cols=2,
|
74
|
-
)
|
75
|
-
|
76
|
-
# Calculate counts and default rate for each category
|
77
|
-
counts = df[feature].value_counts()
|
78
|
-
default_rate = df.groupby(feature)[default_column].mean()
|
79
|
-
|
80
|
-
# Left plot: Counts
|
81
|
-
fig.add_trace(
|
82
|
-
go.Bar(
|
83
|
-
x=counts.index,
|
84
|
-
y=counts.values,
|
85
|
-
name="Counts",
|
86
|
-
marker_color="#6699cc",
|
87
|
-
),
|
88
|
-
row=1,
|
89
|
-
col=1,
|
90
|
-
)
|
91
|
-
|
92
|
-
# Right plot: Default rate
|
93
|
-
fig.add_trace(
|
94
|
-
go.Bar(
|
95
|
-
x=default_rate.index,
|
96
|
-
y=default_rate.values,
|
97
|
-
name="Target Rate",
|
98
|
-
marker_color="orange",
|
99
|
-
),
|
100
|
-
row=1,
|
101
|
-
col=2,
|
102
|
-
)
|
103
|
-
|
104
|
-
fig.update_layout(
|
105
|
-
title_text=f"{feature}", # title of plot
|
106
|
-
autosize=False,
|
107
|
-
width=500,
|
108
|
-
height=400,
|
109
|
-
margin=dict(l=50, r=50, b=100, t=100, pad=4),
|
110
|
-
)
|
111
|
-
|
112
|
-
figures.append(
|
113
|
-
Figure(
|
114
|
-
for_object=self,
|
115
|
-
key=f"{self.key}:{feature}",
|
116
|
-
figure=fig,
|
117
|
-
)
|
118
|
-
)
|
119
|
-
|
120
|
-
return self.cache_results(
|
121
|
-
figures=figures,
|
54
|
+
if np.unique(dataset.df[dataset.target_column]).size != 2:
|
55
|
+
raise SkipTestError(
|
56
|
+
f"Target column '{dataset.target_column}' is not binary. "
|
57
|
+
"This test only works for binary classification tasks."
|
122
58
|
)
|
123
59
|
|
124
|
-
|
125
|
-
|
126
|
-
raise ValueError("The default_column parameter needs to be specified.")
|
60
|
+
if len(dataset.feature_columns_categorical) == 0:
|
61
|
+
raise SkipTestError("No categorical columns found in the dataset")
|
127
62
|
|
128
|
-
|
129
|
-
|
63
|
+
df = dataset.df
|
64
|
+
figures = []
|
130
65
|
|
131
|
-
|
132
|
-
raise ValueError(
|
133
|
-
f"The column {default_column} is not binary. It contains: {unique_values}"
|
134
|
-
)
|
66
|
+
for col in dataset.feature_columns_categorical:
|
135
67
|
|
136
|
-
|
68
|
+
# Calculate counts and default rate for each category
|
69
|
+
counts = df[col].value_counts()
|
70
|
+
default_rate = df.groupby(col)[dataset.target_column].mean()
|
137
71
|
|
138
|
-
|
139
|
-
|
140
|
-
|
72
|
+
fig = make_subplots(
|
73
|
+
rows=1,
|
74
|
+
cols=2,
|
141
75
|
)
|
142
|
-
columns = self.params["columns"]
|
143
76
|
|
144
|
-
#
|
145
|
-
|
77
|
+
# Left plot: Counts
|
78
|
+
fig.add_trace(
|
79
|
+
go.Bar(
|
80
|
+
x=counts.index,
|
81
|
+
y=counts.values,
|
82
|
+
name="Counts",
|
83
|
+
marker_color="#6699cc",
|
84
|
+
),
|
85
|
+
row=1,
|
86
|
+
col=1,
|
87
|
+
)
|
88
|
+
# Right plot: Default rate
|
89
|
+
fig.add_trace(
|
90
|
+
go.Bar(
|
91
|
+
x=default_rate.index,
|
92
|
+
y=default_rate.values,
|
93
|
+
name="Target Rate",
|
94
|
+
marker_color="orange",
|
95
|
+
),
|
96
|
+
row=1,
|
97
|
+
col=2,
|
98
|
+
)
|
146
99
|
|
147
|
-
|
148
|
-
|
100
|
+
fig.update_layout(
|
101
|
+
title_text=col,
|
102
|
+
autosize=False,
|
103
|
+
width=500,
|
104
|
+
height=400,
|
105
|
+
margin=dict(l=50, r=50, b=100, t=100, pad=4),
|
149
106
|
)
|
107
|
+
|
108
|
+
figures.append(fig)
|
109
|
+
|
110
|
+
return tuple(figures)
|
@@ -2,23 +2,17 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from dataclasses import dataclass
|
6
|
-
|
7
5
|
import pandas as pd
|
8
6
|
import plotly.graph_objects as go
|
9
7
|
|
10
|
-
from validmind
|
11
|
-
|
12
|
-
|
13
|
-
ResultTable,
|
14
|
-
ResultTableMetadata,
|
15
|
-
ThresholdTest,
|
16
|
-
ThresholdTestResult,
|
17
|
-
)
|
8
|
+
from validmind import tags, tasks
|
9
|
+
from validmind.errors import SkipTestError
|
10
|
+
from validmind.vm_models import VMDataset
|
18
11
|
|
19
12
|
|
20
|
-
@
|
21
|
-
|
13
|
+
@tags("time_series_data")
|
14
|
+
@tasks("regression")
|
15
|
+
def TimeSeriesFrequency(dataset: VMDataset):
|
22
16
|
"""
|
23
17
|
Evaluates consistency of time series data frequency and generates a frequency plot.
|
24
18
|
|
@@ -63,129 +57,50 @@ class TimeSeriesFrequency(ThresholdTest):
|
|
63
57
|
- Depending on context or the model under development, mixed frequencies might sometimes be acceptable, but this
|
64
58
|
test considers them a failing condition.
|
65
59
|
"""
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
"""
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
"""
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
if not is_datetime:
|
106
|
-
raise ValueError("Dataset must be provided with datetime index")
|
107
|
-
|
108
|
-
freq_df = self._identify_frequencies(self.inputs.dataset.df)
|
109
|
-
n_frequencies = len(freq_df["Frequency"].unique())
|
110
|
-
test_results = [
|
111
|
-
ThresholdTestResult(
|
112
|
-
passed=n_frequencies == 1,
|
113
|
-
values=freq_df.to_dict(orient="list"),
|
114
|
-
)
|
115
|
-
]
|
116
|
-
fig_frequency = self._frequency_plot(self.inputs.dataset.df)
|
117
|
-
test_figures = []
|
118
|
-
test_figures.append(
|
119
|
-
Figure(
|
120
|
-
for_object=self,
|
121
|
-
key=f"{self.name}:frequencyplot",
|
122
|
-
figure=fig_frequency,
|
123
|
-
metadata={"type": "frequencyplot"},
|
124
|
-
)
|
125
|
-
)
|
126
|
-
return self.cache_results(
|
127
|
-
test_results,
|
128
|
-
passed=all([r.passed for r in test_results]),
|
129
|
-
figures=test_figures,
|
130
|
-
)
|
131
|
-
|
132
|
-
def _identify_frequencies(self, df):
|
133
|
-
"""
|
134
|
-
Identify the frequency of each series in the DataFrame.
|
135
|
-
:param df: Time-series DataFrame
|
136
|
-
:return: DataFrame with two columns: 'Variable' and 'Frequency'
|
137
|
-
"""
|
138
|
-
frequencies = []
|
139
|
-
freq_dict = {
|
140
|
-
"S": "Second",
|
141
|
-
"T": "Minute",
|
142
|
-
"min": "Minute",
|
143
|
-
"H": "Hourly",
|
144
|
-
"D": "Daily",
|
145
|
-
"B": "Business day",
|
146
|
-
"W": "Weekly",
|
147
|
-
"MS": "Monthly",
|
148
|
-
"M": "Monthly",
|
149
|
-
"Q": "Quarterly",
|
150
|
-
"A": "Yearly",
|
151
|
-
"Y": "Yearly",
|
152
|
-
}
|
153
|
-
|
154
|
-
for column in df.columns:
|
155
|
-
series = df[column].dropna()
|
156
|
-
if not series.empty:
|
157
|
-
freq = pd.infer_freq(series.index)
|
158
|
-
label = freq_dict.get(freq, freq)
|
159
|
-
else:
|
160
|
-
label = None
|
161
|
-
|
162
|
-
frequencies.append({"Variable": column, "Frequency": label})
|
163
|
-
|
164
|
-
freq_df = pd.DataFrame(frequencies)
|
165
|
-
|
166
|
-
return freq_df
|
167
|
-
|
168
|
-
def _frequency_plot(self, df):
|
169
|
-
"""
|
170
|
-
Creates a frequency plot of time differences between consecutive entries in a DataFrame index using Plotly.
|
171
|
-
Args:
|
172
|
-
df (pandas.DataFrame): The input DataFrame.
|
173
|
-
Returns:
|
174
|
-
A Plotly Figure object representing the frequency plot of time differences.
|
175
|
-
"""
|
176
|
-
# Calculate the time differences between consecutive entries
|
177
|
-
time_diff = df.index.to_series().diff().dropna()
|
178
|
-
|
179
|
-
# Convert the time differences to a suitable unit (e.g., days)
|
180
|
-
time_diff_days = time_diff.dt.total_seconds() / (60 * 60 * 24)
|
181
|
-
|
182
|
-
# Create a Plotly histogram
|
183
|
-
fig = go.Figure(data=[go.Histogram(x=time_diff_days, nbinsx=50)])
|
184
|
-
fig.update_layout(
|
60
|
+
df = dataset.df
|
61
|
+
|
62
|
+
if not pd.api.types.is_datetime64_any_dtype(df.index):
|
63
|
+
raise SkipTestError("Dataset must be provided with datetime index")
|
64
|
+
|
65
|
+
frequencies = []
|
66
|
+
freq_dict = {
|
67
|
+
"S": "Second",
|
68
|
+
"T": "Minute",
|
69
|
+
"min": "Minute",
|
70
|
+
"H": "Hourly",
|
71
|
+
"D": "Daily",
|
72
|
+
"B": "Business day",
|
73
|
+
"W": "Weekly",
|
74
|
+
"MS": "Monthly",
|
75
|
+
"M": "Monthly",
|
76
|
+
"Q": "Quarterly",
|
77
|
+
"A": "Yearly",
|
78
|
+
"Y": "Yearly",
|
79
|
+
}
|
80
|
+
|
81
|
+
for column in dataset.feature_columns_numeric:
|
82
|
+
series = df[column].dropna()
|
83
|
+
if not series.empty:
|
84
|
+
freq = pd.infer_freq(series.index)
|
85
|
+
label = freq_dict.get(freq, freq)
|
86
|
+
else:
|
87
|
+
label = None
|
88
|
+
|
89
|
+
frequencies.append({"Variable": column, "Frequency": label})
|
90
|
+
|
91
|
+
# Calculate the time differences between consecutive entries
|
92
|
+
time_diff = df.index.to_series().diff().dropna()
|
93
|
+
# Convert the time differences to a suitable unit (e.g., days)
|
94
|
+
time_diff_days = time_diff.dt.total_seconds() / (60 * 60 * 24)
|
95
|
+
# Plot the time differences as a histogram
|
96
|
+
fig = go.Figure(
|
97
|
+
data=[go.Histogram(x=time_diff_days, nbinsx=50)],
|
98
|
+
layout=go.Layout(
|
185
99
|
title="Histogram of Time Differences (Days)",
|
186
100
|
xaxis_title="Days",
|
187
101
|
yaxis_title="Frequency",
|
188
102
|
font=dict(size=16),
|
189
|
-
)
|
103
|
+
),
|
104
|
+
)
|
190
105
|
|
191
|
-
|
106
|
+
return frequencies, fig, len(set(item["Frequency"] for item in frequencies)) == 1
|
@@ -2,12 +2,16 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
+
import pandas as pd
|
5
6
|
import plotly.express as px
|
6
7
|
|
7
8
|
from validmind import tags, tasks
|
9
|
+
from validmind.logging import get_logger
|
8
10
|
|
11
|
+
logger = get_logger(__name__)
|
9
12
|
|
10
|
-
|
13
|
+
|
14
|
+
@tags("data_validation", "visualization", "time_series_data")
|
11
15
|
@tasks("regression", "time_series_forecasting")
|
12
16
|
def TimeSeriesHistogram(dataset, nbins=30):
|
13
17
|
"""
|
@@ -51,6 +55,9 @@ def TimeSeriesHistogram(dataset, nbins=30):
|
|
51
55
|
|
52
56
|
df = dataset.df
|
53
57
|
|
58
|
+
if not pd.api.types.is_datetime64_any_dtype(df.index):
|
59
|
+
raise ValueError(f"Dataset {dataset.input_id} must have a datetime index")
|
60
|
+
|
54
61
|
columns = list(dataset.df.columns)
|
55
62
|
|
56
63
|
if not set(columns).issubset(set(df.columns)):
|
@@ -58,12 +65,26 @@ def TimeSeriesHistogram(dataset, nbins=30):
|
|
58
65
|
|
59
66
|
figures = []
|
60
67
|
for col in columns:
|
68
|
+
# Check for missing values and log if any are found
|
69
|
+
missing_count = df[col].isna().sum()
|
70
|
+
if missing_count > 0:
|
71
|
+
logger.info(
|
72
|
+
f"Column '{col}' contains {missing_count} missing values which will be excluded from the histogram."
|
73
|
+
)
|
74
|
+
|
75
|
+
# Drop missing values for the current column
|
76
|
+
valid_data = df[~df[col].isna()]
|
77
|
+
|
61
78
|
fig = px.histogram(
|
62
|
-
|
79
|
+
valid_data,
|
80
|
+
x=col,
|
81
|
+
marginal="violin",
|
82
|
+
nbins=nbins,
|
83
|
+
title=f"Histogram for {col}",
|
63
84
|
)
|
64
85
|
fig.update_layout(
|
65
86
|
title={
|
66
|
-
"text": f"{col}",
|
87
|
+
"text": f"{col} (n={len(valid_data)})",
|
67
88
|
"y": 0.9,
|
68
89
|
"x": 0.5,
|
69
90
|
"xanchor": "center",
|