validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
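The hunks reproduced below illustrate the main structural change in this release: test classes built on Metric / ThresholdTest (with run(), cache_results(), and summary helpers) are rewritten as plain functions decorated with @tags and @tasks that return their tables, figures, and pass/fail flag directly. For orientation, a minimal sketch of the new style follows; the test name, column, and threshold are illustrative assumptions, not code from the package:

    # Sketch only: a decorator-based test in the 2.6.x style (names and values are illustrative).
    from validmind import tags, tasks
    from validmind.vm_models import VMDataset


    @tags("tabular_data", "data_quality")
    @tasks("classification")
    def MinimumRowCount(dataset: VMDataset, min_rows: int = 100):
        """Checks that the dataset contains at least `min_rows` rows."""
        n_rows = dataset.df.shape[0]
        # Returning a table plus a boolean mirrors the (table, figures, passed)
        # tuple returned by the refactored WeakspotsDiagnosis shown below.
        return {"Rows": n_rows, "Minimum Required": min_rows}, n_rows >= min_rows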
validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py

@@ -2,27 +2,165 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from
-from functools import partial
-from typing import List
+from typing import Callable, Dict, List, Tuple, Union

 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from sklearn import metrics

-from validmind.
-
-
-
-
-
-
-
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+DEFAULT_METRICS = {
+    "accuracy": metrics.accuracy_score,
+    "precision": metrics.precision_score,
+    "recall": metrics.recall_score,
+    "f1": metrics.f1_score,
+}
+DEFAULT_THRESHOLDS = {
+    "accuracy": 0.75,
+    "precision": 0.5,
+    "recall": 0.5,
+    "f1": 0.7,
+}
+
+
+def _compute_metrics(
+    results: dict,
+    metrics: Dict[str, Callable],
+    region: str,
+    df_region: pd.DataFrame,
+    target_column: str,
+    prediction_column: str,
+    feature_column: str,
+) -> None:
+    """
+    Computes and appends the default metrics for a given DataFrame slice to the results dictionary.
+    Args:
+        results (dict): A dictionary to which the computed metrics will be appended.
+        region (str): A string identifier for the DataFrame slice being evaluated.
+        df_region (pd.DataFrame): A pandas DataFrame slice containing the data to evaluate.
+        target_column (str): The name of the target column to use for computing the metrics.
+        prediction_column (str): The name of the prediction column to use for computing the metrics.
+    Returns:
+        None: The computed metrics are appended to the `results` dictionary in-place.
+    """
+    results["Slice"].append(str(region))
+    results["Shape"].append(df_region.shape[0])
+    results["Feature"].append(feature_column)
+
+    # Check if df_region is an empty dataframe and if so, append 0 to all metrics
+    if df_region.empty:
+        for metric in metrics.keys():
+            results[metric].append(0)
+        return

+    y_true = df_region[target_column].values
+    y_prediction = (
+        df_region[prediction_column].astype(df_region[target_column].dtypes).values
+    )

-
-
+    for metric, metric_fn in metrics.items():
+        results[metric].append(metric_fn(y_true, y_prediction))
+
+
+def _plot_weak_spots(
+    results_1: dict, results_2: dict, feature_column: str, metric: str, threshold: float
+) -> Tuple[plt.Figure, pd.DataFrame]:
+    """
+    Plots the metric of the training and test datasets for each region in a given feature column,
+    and highlights regions where the score is below a specified threshold.
+    Args:
+        results_1 (list of dict): The results of the model on the training dataset, as a list of dictionaries.
+        results_2 (list of dict): The results of the model on the test dataset, as a list of dictionaries.
+        feature_column (str): The name of the feature column being analyzed.
+        metric (str): The name of the metric to plot.
+        threshold (float): The minimum accuracy threshold to be highlighted on the plot.
+    Returns:
+        fig (matplotlib.figure.Figure): The figure object containing the plot.
+        df (pandas.DataFrame): The concatenated dataframe containing the training and test results.
+    """
+    # Concat training and test datasets
+    results_1 = pd.DataFrame(results_1)
+    results_2 = pd.DataFrame(results_2)
+    dataset_type_column = "Dataset Type"
+    results_1[dataset_type_column] = "Training"
+    results_2[dataset_type_column] = "Test"
+    df = pd.concat([results_1, results_2])
+
+    # Create a bar plot using seaborn library
+    fig, ax = plt.subplots()
+    barplot = sns.barplot(
+        data=df,
+        x="Slice",
+        y=metric,
+        hue=dataset_type_column,
+        edgecolor="black",
+        ax=ax,
+    )
+    ax.tick_params(axis="x", rotation=90)
+    for p in ax.patches:
+        t = ax.annotate(
+            str("{:.2f}%".format(p.get_height())),
+            xy=(p.get_x() + 0.03, p.get_height() + 1),
+        )
+        t.set(color="black", size=14)
+
+    axhline = ax.axhline(
+        y=threshold,
+        color="red",
+        linestyle="--",
+        linewidth=3,
+        label=f"Threshold: {threshold}",
+    )
+    ax.set_ylabel(metric.capitalize(), weight="bold", fontsize=18)
+    ax.set_xlabel("Slice/Segments", weight="bold", fontsize=18)
+    ax.set_title(
+        f"Weak regions in feature column: {feature_column}",
+        weight="bold",
+        fontsize=20,
+        wrap=True,
+    )
+
+    # Get the legend handles and labels from the barplot
+    handles, labels = barplot.get_legend_handles_labels()
+
+    # Append the axhline handle and label
+    handles.append(axhline)
+    labels.append(axhline.get_label())
+
+    # Create a legend with both hue and axhline labels, the threshold line
+    # will show up twice so remove the first element
+    # barplot.legend(handles=handles[:-1], labels=labels, loc="upper right")
+    barplot.legend(
+        handles=handles[:-1],
+        labels=labels[:-1],
+        loc="upper center",
+        bbox_to_anchor=(0.5, 0.1),
+        ncol=len(handles) - 1,
+    )
+
+    plt.close()
+
+    return fig, df
+
+
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "model_diagnosis",
+    "visualization",
+)
+@tasks("classification", "text_classification")
+def WeakspotsDiagnosis(
+    datasets: List[VMDataset],
+    model: VMModel,
+    features_columns: Union[List[str], None] = None,
+    metrics: Union[Dict[str, Callable], None] = None,
+    thresholds: Union[Dict[str, float], None] = None,
+):
     """
     Identifies and visualizes weak spots in a machine learning model's performance across various sections of the
     feature space.
@@ -72,282 +210,88 @@ class WeakspotsDiagnosis(ThresholdTest):
     - Despite its usefulness in highlighting problematic regions, the test does not offer direct suggestions for model
     improvement.
     """
+    feature_columns = features_columns or datasets[0].feature_columns
+    if not all(col in datasets[0].feature_columns for col in feature_columns):
+        raise ValueError(
+            "Column(s) provided in features_columns do not exist in the dataset"
+        )

-
-
-
-    default_params = {
-        "features_columns": None,
-        # Some default values that the user should override
-        "thresholds": {
-            "accuracy": 0.75,
-            "precision": 0.5,
-            "recall": 0.5,
-            "f1": 0.7,
-        },
-    }
-
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_diagnosis",
-        "visualization",
-    ]
+    metrics = metrics or DEFAULT_METRICS
+    metrics = {k.title(): v for k, v in metrics.items()}

-
-
-        "accuracy": metrics.accuracy_score,
-        "precision": partial(metrics.precision_score, zero_division=0),
-        "recall": partial(metrics.recall_score, zero_division=0),
-        "f1": partial(metrics.f1_score, zero_division=0),
-    }
-
-    def run(self):
-        thresholds = self.params["thresholds"]
-
-        # Ensure there is a threshold for each metric
-        for metric in self.default_metrics.keys():
-            if metric not in thresholds:
-                raise ValueError(f"Threshold for metric {metric} is missing")
-
-        if self.params["features_columns"] is None:
-            features_list = self.inputs.datasets[0].feature_columns
-        else:
-            features_list = self.params["features_columns"]
-
-        if self.inputs.datasets[0].text_column in features_list:
-            raise ValueError(
-                "Skiping Weakspots Diagnosis test for the dataset with text column"
-            )
+    thresholds = thresholds or DEFAULT_THRESHOLDS
+    thresholds = {k.title(): v for k, v in thresholds.items()}

-
-
-            elem in self.inputs.datasets[0].feature_columns for elem in features_list
-        )
-        if not all_present:
-            raise ValueError(
-                "The list of feature columns provided do not match with "
-                + "training dataset feature columns"
-            )
+    results_headers = ["Slice", "Shape", "Feature"]
+    results_headers.extend(metrics.keys())

-
-
-
-        train_df = self.inputs.datasets[0].df.copy()
-        train_class_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-        train_df[prediction_column] = train_class_pred
-
-        test_df = self.inputs.datasets[1].df.copy()
-        test_class_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-        test_df[prediction_column] = test_class_pred
-
-        test_results = []
-        test_figures = []
-        results_headers = ["slice", "shape", "feature"]
-        results_headers.extend(self.default_metrics.keys())
-        for feature in features_list:
-            bins = 10
-            if feature in self.inputs.datasets[0].feature_columns_categorical:
-                bins = len(train_df[feature].unique())
-            train_df["bin"] = pd.cut(train_df[feature], bins=bins)
-
-            results_train = {k: [] for k in results_headers}
-            results_test = {k: [] for k in results_headers}
-
-            for region, df_region in train_df.groupby("bin"):
-                self._compute_metrics(
-                    results_train,
-                    region,
-                    df_region,
-                    target_column,
-                    prediction_column,
-                    feature,
-                )
-                df_test_region = test_df[
-                    (test_df[feature] > region.left)
-                    & (test_df[feature] <= region.right)
-                ]
-                self._compute_metrics(
-                    results_test,
-                    region,
-                    df_test_region,
-                    target_column,
-                    prediction_column,
-                    feature,
-                )
-
-            # Make one plot per metric
-            for metric in self.default_metrics.keys():
-                fig, df = self._plot_weak_spots(
-                    results_train,
-                    results_test,
-                    feature,
-                    metric=metric,
-                    threshold=thresholds[metric],
-                )
-
-                test_figures.append(
-                    Figure(
-                        for_object=self,
-                        key=f"{self.name}:{metric}:{feature}",
-                        figure=fig,
-                        metadata={
-                            "metric": metric,
-                            "threshold": thresholds[metric],
-                            "feature": feature,
-                        },
-                    )
-                )
-
-            # For simplicity, test has failed if any of the metrics is below the threshold. We will
-            # rely on visual assessment for this test for now.
-            results_passed = df[df[list(thresholds.keys())].lt(thresholds).any(axis=1)]
-            passed = results_passed.empty
-
-            test_results.append(
-                ThresholdTestResult(
-                    test_name="accuracy",
-                    column=feature,
-                    passed=passed,
-                    values={"records": df.to_dict("records")},
-                )
-            )
-        return self.cache_results(
-            test_results,
-            passed=all([r.passed for r in test_results]),
-            figures=test_figures,
-        )
+    figures = []
+    passed = True

-
-
-
-
-
-
-
-
-                    metadata=ResultTableMetadata(title="Weakspots Test"),
-                )
-            ]
-        )
+    df_1 = datasets[0]._df[
+        feature_columns
+        + [datasets[0].target_column, datasets[0].prediction_column(model)]
+    ]
+    df_2 = datasets[1]._df[
+        feature_columns
+        + [datasets[1].target_column, datasets[1].prediction_column(model)]
+    ]

-
-
-
-
-
-
-
-
-
-""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        y_prediction = (
-            df_region[prediction_column].astype(df_region[target_column].dtypes).values
-        )
+    for feature in feature_columns:
+        bins = 10
+        if feature in datasets[0].feature_columns_categorical:
+            bins = len(df_1[feature].unique())
+        df_1["bin"] = pd.cut(df_1[feature], bins=bins)
+
+        results_1 = {k: [] for k in results_headers}
+        results_2 = {k: [] for k in results_headers}
+
+        for region, df_region in df_1.groupby("bin"):
+            _compute_metrics(
+                results=results_1,
+                metrics=metrics,
+                region=region,
+                df_region=df_region,
+                target_column=datasets[0].target_column,
+                prediction_column=datasets[0].prediction_column(model),
+                feature_column=feature,
+            )
+            df_2_region = df_2[
+                (df_2[feature] > region.left) & (df_2[feature] <= region.right)
+            ]
+            _compute_metrics(
+                results=results_2,
+                metrics=metrics,
+                region=region,
+                df_region=df_2_region,
+                target_column=datasets[1].target_column,
+                prediction_column=datasets[1].prediction_column(model),
+                feature_column=feature,
+            )

-for metric
-
-
-
-
-
-
-        Plots the metric of the training and test datasets for each region in a given feature column,
-        and highlights regions where the score is below a specified threshold.
-        Args:
-            results_train (list of dict): The results of the model on the training dataset, as a list of dictionaries.
-            results_test (list of dict): The results of the model on the test dataset, as a list of dictionaries.
-            feature_column (str): The name of the feature column being analyzed.
-            metric (str): The name of the metric to plot.
-            threshold (float): The minimum accuracy threshold to be highlighted on the plot.
-        Returns:
-            fig (matplotlib.figure.Figure): The figure object containing the plot.
-            df (pandas.DataFrame): The concatenated dataframe containing the training and test results.
-        """
-        # Concat training and test datasets
-        results_train = pd.DataFrame(results_train)
-        results_test = pd.DataFrame(results_test)
-        dataset_type_column = "Dataset Type"
-        results_train[dataset_type_column] = "Training"
-        results_test[dataset_type_column] = "Test"
-        df = pd.concat([results_train, results_test])
-
-        # Create a bar plot using seaborn library
-        fig, ax = plt.subplots()
-        barplot = sns.barplot(
-            data=df,
-            x="slice",
-            y=metric,
-            hue=dataset_type_column,
-            edgecolor="black",
-            ax=ax,
-        )
-        ax.tick_params(axis="x", rotation=90)
-        for p in ax.patches:
-            t = ax.annotate(
-                str("{:.2f}%".format(p.get_height())),
-                xy=(p.get_x() + 0.03, p.get_height() + 1),
+        for metric in metrics.keys():
+            fig, df = _plot_weak_spots(
+                results_1=results_1,
+                results_2=results_2,
+                feature_column=feature,
+                metric=metric,
+                threshold=thresholds[metric],
             )
-            t.set(color="black", size=14)
-
-        axhline = ax.axhline(
-            y=threshold,
-            color="red",
-            linestyle="--",
-            linewidth=3,
-            label=f"Threshold: {threshold}",
-        )
-        ax.set_ylabel(metric.capitalize(), weight="bold", fontsize=18)
-        ax.set_xlabel("Slice/Segments", weight="bold", fontsize=18)
-        ax.set_title(
-            f"Weak regions in feature column: {feature_column}",
-            weight="bold",
-            fontsize=20,
-            wrap=True,
-        )

-
-        handles, labels = barplot.get_legend_handles_labels()
-
-        # Append the axhline handle and label
-        handles.append(axhline)
-        labels.append(axhline.get_label())
-
-        # Create a legend with both hue and axhline labels, the threshold line
-        # will show up twice so remove the first element
-        # barplot.legend(handles=handles[:-1], labels=labels, loc="upper right")
-        barplot.legend(
-            handles=handles[:-1],
-            labels=labels[:-1],
-            loc="upper center",
-            bbox_to_anchor=(0.5, 0.1),
-            ncol=len(handles) - 1,
-        )
+            figures.append(fig)

-#
-
+        # For simplicity, test has failed if any of the metrics is below the threshold. We will
+        # rely on visual assessment for this test for now.
+        if not df[df[list(thresholds.keys())].lt(thresholds).any(axis=1)].empty:
+            passed = False

-
+    return (
+        pd.concat(
+            [
+                pd.DataFrame(results_1).assign(Dataset=datasets[0].input_id),
+                pd.DataFrame(results_2).assign(Dataset=datasets[1].input_id),
+            ]
+        ).sort_values(["Feature", "Dataset"]),
+        *figures,
+        passed,
+    )
validmind/tests/model_validation/statsmodels/AutoARIMA.py

@@ -5,13 +5,16 @@
 from statsmodels.tsa.arima.model import ARIMA
 from statsmodels.tsa.stattools import adfuller

+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import
+from validmind.vm_models import VMDataset, VMModel

 logger = get_logger(__name__)


-
+@tags("time_series_data", "forecasting", "model_selection", "statsmodels")
+@tasks("regression")
+def AutoARIMA(model: VMModel, dataset: VMDataset):
     """
     Evaluates ARIMA models for time-series forecasting, ranking them using Bayesian and Akaike Information Criteria.

@@ -61,56 +64,48 @@ class AutoARIMA(Metric):
     - The test is only applicable to regression tasks involving time-series data, and may not work effectively for
     other types of machine learning tasks.
     """
-
-    name = "auto_arima"
-    required_inputs = ["dataset"]
-    tasks = ["regression"]
-    tags = ["time_series_data", "forecasting", "model_selection", "statsmodels"]
-
     max_p = 3
     max_d = 2
     max_q = 3

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            result = {
+    df = dataset.x_df()
+
+    table = []
+
+    for col in df.columns:
+        series = df[col].dropna()
+
+        # Check for stationarity using the Augmented Dickey-Fuller test
+        adf_test = adfuller(series)
+        if adf_test[1] > 0.05:
+            logger.warning(f"{col} is not stationary. Results may be inaccurate.")
+
+        arima_orders = []
+        bic_values = []
+        aic_values = []
+
+        for p in range(max_p + 1):
+            for d in range(max_d + 1):
+                for q in range(max_q + 1):
+                    try:
+                        model = ARIMA(series, order=(p, d, q))
+                        model_fit = model.fit()
+
+                        arima_orders.append((p, d, q))
+                        bic_values.append(model_fit.bic)
+                        aic_values.append(model_fit.aic)
+                    except Exception as e:
+                        logger.error(
+                            f"Error fitting ARIMA({p}, {d}, {q}) model for {col}: {e}"
+                        )
+
+        table.append(
+            {
                 "Variable": col,
                 "ARIMA Orders": arima_orders,
                 "BIC": bic_values,
                 "AIC": aic_values,
             }
-
+        )

-
+    return table