validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.24.dist-info/METADATA +0 -118
  196. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py
@@ -2,27 +2,165 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from functools import partial
-from typing import List
+from typing import Callable, Dict, List, Tuple, Union
 
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from sklearn import metrics
 
-from validmind.vm_models import (
-    Figure,
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+DEFAULT_METRICS = {
+    "accuracy": metrics.accuracy_score,
+    "precision": metrics.precision_score,
+    "recall": metrics.recall_score,
+    "f1": metrics.f1_score,
+}
+DEFAULT_THRESHOLDS = {
+    "accuracy": 0.75,
+    "precision": 0.5,
+    "recall": 0.5,
+    "f1": 0.7,
+}
+
+
+def _compute_metrics(
+    results: dict,
+    metrics: Dict[str, Callable],
+    region: str,
+    df_region: pd.DataFrame,
+    target_column: str,
+    prediction_column: str,
+    feature_column: str,
+) -> None:
+    """
+    Computes and appends the default metrics for a given DataFrame slice to the results dictionary.
+    Args:
+        results (dict): A dictionary to which the computed metrics will be appended.
+        region (str): A string identifier for the DataFrame slice being evaluated.
+        df_region (pd.DataFrame): A pandas DataFrame slice containing the data to evaluate.
+        target_column (str): The name of the target column to use for computing the metrics.
+        prediction_column (str): The name of the prediction column to use for computing the metrics.
+    Returns:
+        None: The computed metrics are appended to the `results` dictionary in-place.
+    """
+    results["Slice"].append(str(region))
+    results["Shape"].append(df_region.shape[0])
+    results["Feature"].append(feature_column)
+
+    # Check if df_region is an empty dataframe and if so, append 0 to all metrics
+    if df_region.empty:
+        for metric in metrics.keys():
+            results[metric].append(0)
+        return
 
+    y_true = df_region[target_column].values
+    y_prediction = (
+        df_region[prediction_column].astype(df_region[target_column].dtypes).values
+    )
 
-@dataclass
-class WeakspotsDiagnosis(ThresholdTest):
+    for metric, metric_fn in metrics.items():
+        results[metric].append(metric_fn(y_true, y_prediction))
+
+
+def _plot_weak_spots(
+    results_1: dict, results_2: dict, feature_column: str, metric: str, threshold: float
+) -> Tuple[plt.Figure, pd.DataFrame]:
+    """
+    Plots the metric of the training and test datasets for each region in a given feature column,
+    and highlights regions where the score is below a specified threshold.
+    Args:
+        results_1 (list of dict): The results of the model on the training dataset, as a list of dictionaries.
+        results_2 (list of dict): The results of the model on the test dataset, as a list of dictionaries.
+        feature_column (str): The name of the feature column being analyzed.
+        metric (str): The name of the metric to plot.
+        threshold (float): The minimum accuracy threshold to be highlighted on the plot.
+    Returns:
+        fig (matplotlib.figure.Figure): The figure object containing the plot.
+        df (pandas.DataFrame): The concatenated dataframe containing the training and test results.
+    """
+    # Concat training and test datasets
+    results_1 = pd.DataFrame(results_1)
+    results_2 = pd.DataFrame(results_2)
+    dataset_type_column = "Dataset Type"
+    results_1[dataset_type_column] = "Training"
+    results_2[dataset_type_column] = "Test"
+    df = pd.concat([results_1, results_2])
+
+    # Create a bar plot using seaborn library
+    fig, ax = plt.subplots()
+    barplot = sns.barplot(
+        data=df,
+        x="Slice",
+        y=metric,
+        hue=dataset_type_column,
+        edgecolor="black",
+        ax=ax,
+    )
+    ax.tick_params(axis="x", rotation=90)
+    for p in ax.patches:
+        t = ax.annotate(
+            str("{:.2f}%".format(p.get_height())),
+            xy=(p.get_x() + 0.03, p.get_height() + 1),
+        )
+        t.set(color="black", size=14)
+
+    axhline = ax.axhline(
+        y=threshold,
+        color="red",
+        linestyle="--",
+        linewidth=3,
+        label=f"Threshold: {threshold}",
+    )
+    ax.set_ylabel(metric.capitalize(), weight="bold", fontsize=18)
+    ax.set_xlabel("Slice/Segments", weight="bold", fontsize=18)
+    ax.set_title(
+        f"Weak regions in feature column: {feature_column}",
+        weight="bold",
+        fontsize=20,
+        wrap=True,
+    )
+
+    # Get the legend handles and labels from the barplot
+    handles, labels = barplot.get_legend_handles_labels()
+
+    # Append the axhline handle and label
+    handles.append(axhline)
+    labels.append(axhline.get_label())
+
+    # Create a legend with both hue and axhline labels, the threshold line
+    # will show up twice so remove the first element
+    # barplot.legend(handles=handles[:-1], labels=labels, loc="upper right")
+    barplot.legend(
+        handles=handles[:-1],
+        labels=labels[:-1],
+        loc="upper center",
+        bbox_to_anchor=(0.5, 0.1),
+        ncol=len(handles) - 1,
+    )
+
+    plt.close()
+
+    return fig, df
+
+
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "model_diagnosis",
+    "visualization",
+)
+@tasks("classification", "text_classification")
+def WeakspotsDiagnosis(
+    datasets: List[VMDataset],
+    model: VMModel,
+    features_columns: Union[List[str], None] = None,
+    metrics: Union[Dict[str, Callable], None] = None,
+    thresholds: Union[Dict[str, float], None] = None,
+):
     """
     Identifies and visualizes weak spots in a machine learning model's performance across various sections of the
     feature space.
@@ -72,282 +210,88 @@ class WeakspotsDiagnosis(ThresholdTest):
     - Despite its usefulness in highlighting problematic regions, the test does not offer direct suggestions for model
     improvement.
     """
+    feature_columns = features_columns or datasets[0].feature_columns
+    if not all(col in datasets[0].feature_columns for col in feature_columns):
+        raise ValueError(
+            "Column(s) provided in features_columns do not exist in the dataset"
+        )
 
-    name = "weak_spots"
-    required_inputs = ["model", "datasets"]
-
-    default_params = {
-        "features_columns": None,
-        # Some default values that the user should override
-        "thresholds": {
-            "accuracy": 0.75,
-            "precision": 0.5,
-            "recall": 0.5,
-            "f1": 0.7,
-        },
-    }
-
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_diagnosis",
-        "visualization",
-    ]
+    metrics = metrics or DEFAULT_METRICS
+    metrics = {k.title(): v for k, v in metrics.items()}
 
-    # TODO: allow configuring
-    default_metrics = {
-        "accuracy": metrics.accuracy_score,
-        "precision": partial(metrics.precision_score, zero_division=0),
-        "recall": partial(metrics.recall_score, zero_division=0),
-        "f1": partial(metrics.f1_score, zero_division=0),
-    }
-
-    def run(self):
-        thresholds = self.params["thresholds"]
-
-        # Ensure there is a threshold for each metric
-        for metric in self.default_metrics.keys():
-            if metric not in thresholds:
-                raise ValueError(f"Threshold for metric {metric} is missing")
-
-        if self.params["features_columns"] is None:
-            features_list = self.inputs.datasets[0].feature_columns
-        else:
-            features_list = self.params["features_columns"]
-
-        if self.inputs.datasets[0].text_column in features_list:
-            raise ValueError(
-                "Skiping Weakspots Diagnosis test for the dataset with text column"
-            )
+    thresholds = thresholds or DEFAULT_THRESHOLDS
+    thresholds = {k.title(): v for k, v in thresholds.items()}
 
-        # Check if all elements from features_list are present in the feature columns
-        all_present = all(
-            elem in self.inputs.datasets[0].feature_columns for elem in features_list
-        )
-        if not all_present:
-            raise ValueError(
-                "The list of feature columns provided do not match with "
-                + "training dataset feature columns"
-            )
+    results_headers = ["Slice", "Shape", "Feature"]
+    results_headers.extend(metrics.keys())
 
-        target_column = self.inputs.datasets[0].target_column
-        prediction_column = f"{target_column}_pred"
-
-        train_df = self.inputs.datasets[0].df.copy()
-        train_class_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-        train_df[prediction_column] = train_class_pred
-
-        test_df = self.inputs.datasets[1].df.copy()
-        test_class_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-        test_df[prediction_column] = test_class_pred
-
-        test_results = []
-        test_figures = []
-        results_headers = ["slice", "shape", "feature"]
-        results_headers.extend(self.default_metrics.keys())
-        for feature in features_list:
-            bins = 10
-            if feature in self.inputs.datasets[0].feature_columns_categorical:
-                bins = len(train_df[feature].unique())
-            train_df["bin"] = pd.cut(train_df[feature], bins=bins)
-
-            results_train = {k: [] for k in results_headers}
-            results_test = {k: [] for k in results_headers}
-
-            for region, df_region in train_df.groupby("bin"):
-                self._compute_metrics(
-                    results_train,
-                    region,
-                    df_region,
-                    target_column,
-                    prediction_column,
-                    feature,
-                )
-                df_test_region = test_df[
-                    (test_df[feature] > region.left)
-                    & (test_df[feature] <= region.right)
-                ]
-                self._compute_metrics(
-                    results_test,
-                    region,
-                    df_test_region,
-                    target_column,
-                    prediction_column,
-                    feature,
-                )
-
-            # Make one plot per metric
-            for metric in self.default_metrics.keys():
-                fig, df = self._plot_weak_spots(
-                    results_train,
-                    results_test,
-                    feature,
-                    metric=metric,
-                    threshold=thresholds[metric],
-                )
-
-                test_figures.append(
-                    Figure(
-                        for_object=self,
-                        key=f"{self.name}:{metric}:{feature}",
-                        figure=fig,
-                        metadata={
-                            "metric": metric,
-                            "threshold": thresholds[metric],
-                            "feature": feature,
-                        },
-                    )
-                )
-
-            # For simplicity, test has failed if any of the metrics is below the threshold. We will
-            # rely on visual assessment for this test for now.
-            results_passed = df[df[list(thresholds.keys())].lt(thresholds).any(axis=1)]
-            passed = results_passed.empty
-
-            test_results.append(
-                ThresholdTestResult(
-                    test_name="accuracy",
-                    column=feature,
-                    passed=passed,
-                    values={"records": df.to_dict("records")},
-                )
-            )
-        return self.cache_results(
-            test_results,
-            passed=all([r.passed for r in test_results]),
-            figures=test_figures,
-        )
+    figures = []
+    passed = True
 
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        results_table = [
-            record for result in results for record in result.values["records"]
-        ]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results_table,
-                    metadata=ResultTableMetadata(title="Weakspots Test"),
-                )
-            ]
-        )
+    df_1 = datasets[0]._df[
+        feature_columns
+        + [datasets[0].target_column, datasets[0].prediction_column(model)]
+    ]
+    df_2 = datasets[1]._df[
+        feature_columns
+        + [datasets[1].target_column, datasets[1].prediction_column(model)]
+    ]
 
-    def _compute_metrics(
-        self,
-        results: dict,
-        region: str,
-        df_region: pd.DataFrame,
-        target_column: str,
-        prediction_column: str,
-        feature_column: str,
-    ) -> None:
-        """
-        Computes and appends the default metrics for a given DataFrame slice to the results dictionary.
-        Args:
-            results (dict): A dictionary to which the computed metrics will be appended.
-            region (str): A string identifier for the DataFrame slice being evaluated.
-            df_region (pd.DataFrame): A pandas DataFrame slice containing the data to evaluate.
-            target_column (str): The name of the target column to use for computing the metrics.
-            prediction_column (str): The name of the prediction column to use for computing the metrics.
-        Returns:
-            None: The computed metrics are appended to the `results` dictionary in-place.
-        """
-        results["slice"].append(str(region))
-        results["shape"].append(df_region.shape[0])
-        results["feature"].append(feature_column)
-
-        # Check if df_region is an empty dataframe and if so, append 0 to all metrics
-        if df_region.empty:
-            for metric in self.default_metrics.keys():
-                results[metric].append(0)
-            return
-
-        y_true = df_region[target_column].values
-        y_prediction = (
-            df_region[prediction_column].astype(df_region[target_column].dtypes).values
-        )
+    for feature in feature_columns:
+        bins = 10
+        if feature in datasets[0].feature_columns_categorical:
+            bins = len(df_1[feature].unique())
+        df_1["bin"] = pd.cut(df_1[feature], bins=bins)
+
+        results_1 = {k: [] for k in results_headers}
+        results_2 = {k: [] for k in results_headers}
+
+        for region, df_region in df_1.groupby("bin"):
+            _compute_metrics(
+                results=results_1,
+                metrics=metrics,
+                region=region,
+                df_region=df_region,
+                target_column=datasets[0].target_column,
+                prediction_column=datasets[0].prediction_column(model),
+                feature_column=feature,
+            )
+            df_2_region = df_2[
+                (df_2[feature] > region.left) & (df_2[feature] <= region.right)
+            ]
+            _compute_metrics(
+                results=results_2,
+                metrics=metrics,
+                region=region,
+                df_region=df_2_region,
+                target_column=datasets[1].target_column,
+                prediction_column=datasets[1].prediction_column(model),
+                feature_column=feature,
+            )
 
-        for metric, metric_fn in self.default_metrics.items():
-            results[metric].append(metric_fn(y_true, y_prediction))
-
-    def _plot_weak_spots(
-        self, results_train, results_test, feature_column, metric, threshold
-    ):
-        """
-        Plots the metric of the training and test datasets for each region in a given feature column,
-        and highlights regions where the score is below a specified threshold.
-        Args:
-            results_train (list of dict): The results of the model on the training dataset, as a list of dictionaries.
-            results_test (list of dict): The results of the model on the test dataset, as a list of dictionaries.
-            feature_column (str): The name of the feature column being analyzed.
-            metric (str): The name of the metric to plot.
-            threshold (float): The minimum accuracy threshold to be highlighted on the plot.
-        Returns:
-            fig (matplotlib.figure.Figure): The figure object containing the plot.
-            df (pandas.DataFrame): The concatenated dataframe containing the training and test results.
-        """
-        # Concat training and test datasets
-        results_train = pd.DataFrame(results_train)
-        results_test = pd.DataFrame(results_test)
-        dataset_type_column = "Dataset Type"
-        results_train[dataset_type_column] = "Training"
-        results_test[dataset_type_column] = "Test"
-        df = pd.concat([results_train, results_test])
-
-        # Create a bar plot using seaborn library
-        fig, ax = plt.subplots()
-        barplot = sns.barplot(
-            data=df,
-            x="slice",
-            y=metric,
-            hue=dataset_type_column,
-            edgecolor="black",
-            ax=ax,
-        )
-        ax.tick_params(axis="x", rotation=90)
-        for p in ax.patches:
-            t = ax.annotate(
-                str("{:.2f}%".format(p.get_height())),
-                xy=(p.get_x() + 0.03, p.get_height() + 1),
+        for metric in metrics.keys():
+            fig, df = _plot_weak_spots(
+                results_1=results_1,
+                results_2=results_2,
+                feature_column=feature,
+                metric=metric,
+                threshold=thresholds[metric],
             )
-            t.set(color="black", size=14)
-
-        axhline = ax.axhline(
-            y=threshold,
-            color="red",
-            linestyle="--",
-            linewidth=3,
-            label=f"Threshold: {threshold}",
-        )
-        ax.set_ylabel(metric.capitalize(), weight="bold", fontsize=18)
-        ax.set_xlabel("Slice/Segments", weight="bold", fontsize=18)
-        ax.set_title(
-            f"Weak regions in feature column: {feature_column}",
-            weight="bold",
-            fontsize=20,
-            wrap=True,
-        )
 
-        # Get the legend handles and labels from the barplot
-        handles, labels = barplot.get_legend_handles_labels()
-
-        # Append the axhline handle and label
-        handles.append(axhline)
-        labels.append(axhline.get_label())
-
-        # Create a legend with both hue and axhline labels, the threshold line
-        # will show up twice so remove the first element
-        # barplot.legend(handles=handles[:-1], labels=labels, loc="upper right")
-        barplot.legend(
-            handles=handles[:-1],
-            labels=labels[:-1],
-            loc="upper center",
-            bbox_to_anchor=(0.5, 0.1),
-            ncol=len(handles) - 1,
-        )
+            figures.append(fig)
 
-        # Do this if you want to prevent the figure from being displayed
-        plt.close("all")
+        # For simplicity, test has failed if any of the metrics is below the threshold. We will
+        # rely on visual assessment for this test for now.
+        if not df[df[list(thresholds.keys())].lt(thresholds).any(axis=1)].empty:
+            passed = False
 
-        return fig, df
+    return (
+        pd.concat(
+            [
+                pd.DataFrame(results_1).assign(Dataset=datasets[0].input_id),
+                pd.DataFrame(results_2).assign(Dataset=datasets[1].input_id),
+            ]
+        ).sort_values(["Feature", "Dataset"]),
+        *figures,
+        passed,
+    )
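
Note on consuming the refactor above (illustrative only, not part of this diff): in 2.6.x the test is a plain decorated function rather than a ThresholdTest subclass, so it is driven through the test harness with inputs and params instead of being instantiated. The sketch below assumes the vm.init_dataset / vm.init_model / vm.tests.run_test entry points and the assign_predictions step behave as in earlier 2.x releases; the dataframe, target column, and input_id names are hypothetical.

# Hypothetical usage sketch -- not part of this diff.
import validmind as vm

# assumes vm.init(...) has already been called with valid credentials
vm_train_ds = vm.init_dataset(dataset=train_df, target_column="target", input_id="train_ds")
vm_test_ds = vm.init_dataset(dataset=test_df, target_column="target", input_id="test_ds")
vm_model = vm.init_model(model, input_id="champion_model")

# the refactored test reads predictions via dataset.prediction_column(model),
# so predictions must be assigned to both datasets before running it
vm_train_ds.assign_predictions(model=vm_model)
vm_test_ds.assign_predictions(model=vm_model)

result = vm.tests.run_test(
    "validmind.model_validation.sklearn.WeakspotsDiagnosis",
    inputs={"model": vm_model, "datasets": [vm_train_ds, vm_test_ds]},
    params={"thresholds": {"accuracy": 0.8, "precision": 0.6, "recall": 0.6, "f1": 0.75}},
)
result.log()  # send the results table, per-metric figures, and pass/fail flag to the platform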
validmind/tests/model_validation/statsmodels/AutoARIMA.py
@@ -5,13 +5,16 @@
 from statsmodels.tsa.arima.model import ARIMA
 from statsmodels.tsa.stattools import adfuller
 
+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import Metric
+from validmind.vm_models import VMDataset, VMModel
 
 logger = get_logger(__name__)
 
 
-class AutoARIMA(Metric):
+@tags("time_series_data", "forecasting", "model_selection", "statsmodels")
+@tasks("regression")
+def AutoARIMA(model: VMModel, dataset: VMDataset):
     """
     Evaluates ARIMA models for time-series forecasting, ranking them using Bayesian and Akaike Information Criteria.
 
@@ -61,56 +64,48 @@ class AutoARIMA(Metric):
     - The test is only applicable to regression tasks involving time-series data, and may not work effectively for
     other types of machine learning tasks.
     """
-
-    name = "auto_arima"
-    required_inputs = ["dataset"]
-    tasks = ["regression"]
-    tags = ["time_series_data", "forecasting", "model_selection", "statsmodels"]
-
     max_p = 3
     max_d = 2
     max_q = 3
 
-    def run(self):
-        x_train = self.inputs.dataset.df
-
-        results = []
-
-        for col in x_train.columns:
-            series = x_train[col].dropna()
-
-            # Check for stationarity using the Augmented Dickey-Fuller test
-            adf_test = adfuller(series)
-            if adf_test[1] > 0.05:
-                logger.warning(
-                    f"Warning: {col} is not stationary. Results may be inaccurate."
-                )
-
-            arima_orders = []
-            bic_values = []
-            aic_values = []
-
-            for p in range(self.max_p + 1):
-                for d in range(self.max_d + 1):
-                    for q in range(self.max_q + 1):
-                        try:
-                            model = ARIMA(series, order=(p, d, q))
-                            model_fit = model.fit()
-
-                            arima_orders.append((p, d, q))
-                            bic_values.append(model_fit.bic)
-                            aic_values.append(model_fit.aic)
-                        except Exception as e:
-                            logger.error(
-                                f"Error fitting ARIMA({p}, {d}, {q}) model for {col}: {e}"
-                            )
-
-            result = {
+    df = dataset.x_df()
+
+    table = []
+
+    for col in df.columns:
+        series = df[col].dropna()
+
+        # Check for stationarity using the Augmented Dickey-Fuller test
+        adf_test = adfuller(series)
+        if adf_test[1] > 0.05:
+            logger.warning(f"{col} is not stationary. Results may be inaccurate.")
+
+        arima_orders = []
+        bic_values = []
+        aic_values = []
+
+        for p in range(max_p + 1):
+            for d in range(max_d + 1):
+                for q in range(max_q + 1):
+                    try:
+                        model = ARIMA(series, order=(p, d, q))
+                        model_fit = model.fit()
+
+                        arima_orders.append((p, d, q))
+                        bic_values.append(model_fit.bic)
+                        aic_values.append(model_fit.aic)
+                    except Exception as e:
+                        logger.error(
+                            f"Error fitting ARIMA({p}, {d}, {q}) model for {col}: {e}"
+                        )
+
+        table.append(
+            {
                 "Variable": col,
                 "ARIMA Orders": arima_orders,
                 "BIC": bic_values,
                 "AIC": aic_values,
             }
-            results.append(result)
+        )
 
-        return self.cache_results(results)
+    return table
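
The refactored AutoARIMA returns one row per variable with parallel lists of candidate orders and their BIC/AIC scores, instead of caching a metric result. The following post-processing sketch (illustrative, not part of this diff; plain pandas) reduces that table to the lowest-BIC order per variable:

import pandas as pd

def best_arima_orders(table):
    # Reduce AutoARIMA's per-variable candidate lists to the lowest-BIC (p, d, q).
    rows = []
    for row in table:
        if not row["BIC"]:  # all candidate fits failed for this variable
            continue
        best = min(range(len(row["BIC"])), key=lambda i: row["BIC"][i])
        rows.append(
            {
                "Variable": row["Variable"],
                "Best Order (BIC)": row["ARIMA Orders"][best],
                "BIC": row["BIC"][best],
                "AIC": row["AIC"][best],
            }
        )
    return pd.DataFrame(rows)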