validmind 2.7.5__py3-none-any.whl → 2.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +2 -0
- validmind/__version__.py +1 -1
- validmind/api_client.py +8 -1
- validmind/datasets/credit_risk/lending_club.py +352 -87
- validmind/html_templates/content_blocks.py +1 -1
- validmind/tests/__types__.py +17 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +6 -2
- validmind/tests/data_validation/AutoMA.py +2 -2
- validmind/tests/data_validation/BivariateScatterPlots.py +4 -2
- validmind/tests/data_validation/BoxPierce.py +2 -2
- validmind/tests/data_validation/ClassImbalance.py +2 -1
- validmind/tests/data_validation/DatasetDescription.py +11 -2
- validmind/tests/data_validation/DatasetSplit.py +2 -2
- validmind/tests/data_validation/DickeyFullerGLS.py +2 -2
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +8 -2
- validmind/tests/data_validation/HighCardinality.py +9 -2
- validmind/tests/data_validation/HighPearsonCorrelation.py +18 -4
- validmind/tests/data_validation/IQROutliersBarPlot.py +9 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +2 -2
- validmind/tests/data_validation/MissingValuesBarPlot.py +12 -9
- validmind/tests/data_validation/MutualInformation.py +6 -8
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +2 -2
- validmind/tests/data_validation/ProtectedClassesCombination.py +6 -1
- validmind/tests/data_validation/ProtectedClassesDescription.py +1 -1
- validmind/tests/data_validation/ProtectedClassesDisparity.py +4 -5
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +1 -4
- validmind/tests/data_validation/RollingStatsPlot.py +21 -10
- validmind/tests/data_validation/ScatterPlot.py +3 -5
- validmind/tests/data_validation/ScoreBandDefaultRates.py +2 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -2
- validmind/tests/data_validation/Skewness.py +6 -3
- validmind/tests/data_validation/SpreadPlot.py +8 -3
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +4 -2
- validmind/tests/data_validation/TabularDateTimeHistograms.py +2 -2
- validmind/tests/data_validation/TargetRateBarPlots.py +4 -3
- validmind/tests/data_validation/TimeSeriesFrequency.py +7 -2
- validmind/tests/data_validation/TimeSeriesMissingValues.py +14 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +1 -5
- validmind/tests/data_validation/WOEBinPlots.py +2 -2
- validmind/tests/data_validation/WOEBinTable.py +11 -9
- validmind/tests/data_validation/nlp/CommonWords.py +2 -2
- validmind/tests/data_validation/nlp/Hashtags.py +2 -2
- validmind/tests/data_validation/nlp/LanguageDetection.py +9 -6
- validmind/tests/data_validation/nlp/Mentions.py +9 -6
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -2
- validmind/tests/data_validation/nlp/Punctuations.py +4 -2
- validmind/tests/data_validation/nlp/Sentiment.py +2 -2
- validmind/tests/data_validation/nlp/StopWords.py +5 -4
- validmind/tests/data_validation/nlp/TextDescription.py +2 -2
- validmind/tests/data_validation/nlp/Toxicity.py +2 -2
- validmind/tests/model_validation/BertScore.py +2 -2
- validmind/tests/model_validation/BleuScore.py +2 -2
- validmind/tests/model_validation/ClusterSizeDistribution.py +2 -2
- validmind/tests/model_validation/ContextualRecall.py +2 -2
- validmind/tests/model_validation/FeaturesAUC.py +2 -2
- validmind/tests/model_validation/MeteorScore.py +2 -2
- validmind/tests/model_validation/ModelPredictionResiduals.py +2 -2
- validmind/tests/model_validation/RegardScore.py +6 -2
- validmind/tests/model_validation/RegressionResidualsPlot.py +4 -3
- validmind/tests/model_validation/RougeScore.py +6 -5
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +11 -2
- validmind/tests/model_validation/TokenDisparity.py +2 -2
- validmind/tests/model_validation/ToxicityScore.py +10 -2
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +9 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +16 -2
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +2 -2
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +14 -4
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +2 -2
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +16 -2
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +2 -2
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -5
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +4 -2
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +4 -2
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +4 -2
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +4 -2
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +8 -6
- validmind/tests/model_validation/embeddings/utils.py +11 -1
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +2 -1
- validmind/tests/model_validation/ragas/AspectCritic.py +11 -7
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +2 -1
- validmind/tests/model_validation/ragas/ContextPrecision.py +2 -1
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +2 -1
- validmind/tests/model_validation/ragas/ContextRecall.py +2 -1
- validmind/tests/model_validation/ragas/Faithfulness.py +2 -1
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +2 -1
- validmind/tests/model_validation/ragas/ResponseRelevancy.py +2 -1
- validmind/tests/model_validation/ragas/SemanticSimilarity.py +2 -1
- validmind/tests/model_validation/sklearn/CalibrationCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +2 -5
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -2
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +2 -2
- validmind/tests/model_validation/sklearn/FeatureImportance.py +1 -14
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +6 -3
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +2 -2
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +8 -4
- validmind/tests/model_validation/sklearn/ModelParameters.py +1 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +2 -2
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +20 -16
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +4 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +7 -9
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +1 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +2 -1
- validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +2 -1
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -3
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -1
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +1 -1
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +11 -4
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -3
- validmind/tests/model_validation/statsmodels/GINITable.py +7 -15
- validmind/tests/model_validation/statsmodels/Lilliefors.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +5 -2
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +5 -2
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +7 -7
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +2 -2
- validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +220 -0
- validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +155 -0
- validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py +146 -0
- validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +148 -0
- validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +193 -0
- validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +178 -0
- validmind/tests/ongoing_monitoring/FeatureDrift.py +120 -120
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +18 -23
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +86 -44
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +204 -0
- validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py +98 -0
- validmind/tests/ongoing_monitoring/ROCCurveDrift.py +150 -0
- validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +212 -0
- validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +209 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +91 -13
- validmind/tests/prompt_validation/Bias.py +13 -9
- validmind/tests/prompt_validation/Clarity.py +13 -9
- validmind/tests/prompt_validation/Conciseness.py +13 -9
- validmind/tests/prompt_validation/Delimitation.py +13 -9
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +6 -2
- validmind/tests/prompt_validation/Specificity.py +13 -9
- validmind/tests/run.py +6 -0
- validmind/utils.py +7 -8
- validmind/vm_models/dataset/dataset.py +0 -4
- {validmind-2.7.5.dist-info → validmind-2.7.7.dist-info}/METADATA +2 -3
- {validmind-2.7.5.dist-info → validmind-2.7.7.dist-info}/RECORD +149 -138
- {validmind-2.7.5.dist-info → validmind-2.7.7.dist-info}/WHEEL +1 -1
- {validmind-2.7.5.dist-info → validmind-2.7.7.dist-info}/LICENSE +0 -0
- {validmind-2.7.5.dist-info → validmind-2.7.7.dist-info}/entry_points.txt +0 -0
validmind/tests/__types__.py
CHANGED
@@ -39,6 +39,7 @@ TestID = Union[
|
|
39
39
|
"validmind.data_validation.LaggedCorrelationHeatmap",
|
40
40
|
"validmind.data_validation.MissingValues",
|
41
41
|
"validmind.data_validation.MissingValuesBarPlot",
|
42
|
+
"validmind.data_validation.MutualInformation",
|
42
43
|
"validmind.data_validation.PearsonCorrelationMatrix",
|
43
44
|
"validmind.data_validation.PhillipsPerronArch",
|
44
45
|
"validmind.data_validation.ProtectedClassesCombination",
|
@@ -48,6 +49,7 @@ TestID = Union[
|
|
48
49
|
"validmind.data_validation.RollingStatsPlot",
|
49
50
|
"validmind.data_validation.RunsTest",
|
50
51
|
"validmind.data_validation.ScatterPlot",
|
52
|
+
"validmind.data_validation.ScoreBandDefaultRates",
|
51
53
|
"validmind.data_validation.SeasonalDecompose",
|
52
54
|
"validmind.data_validation.ShapiroWilk",
|
53
55
|
"validmind.data_validation.Skewness",
|
@@ -121,7 +123,9 @@ TestID = Union[
|
|
121
123
|
"validmind.model_validation.ragas.SemanticSimilarity",
|
122
124
|
"validmind.model_validation.sklearn.AdjustedMutualInformation",
|
123
125
|
"validmind.model_validation.sklearn.AdjustedRandIndex",
|
126
|
+
"validmind.model_validation.sklearn.CalibrationCurve",
|
124
127
|
"validmind.model_validation.sklearn.ClassifierPerformance",
|
128
|
+
"validmind.model_validation.sklearn.ClassifierThresholdOptimization",
|
125
129
|
"validmind.model_validation.sklearn.ClusterCosineSimilarity",
|
126
130
|
"validmind.model_validation.sklearn.ClusterPerformanceMetrics",
|
127
131
|
"validmind.model_validation.sklearn.CompletenessScore",
|
@@ -134,6 +138,7 @@ TestID = Union[
|
|
134
138
|
"validmind.model_validation.sklearn.MinimumAccuracy",
|
135
139
|
"validmind.model_validation.sklearn.MinimumF1Score",
|
136
140
|
"validmind.model_validation.sklearn.MinimumROCAUCScore",
|
141
|
+
"validmind.model_validation.sklearn.ModelParameters",
|
137
142
|
"validmind.model_validation.sklearn.ModelsPerformanceComparison",
|
138
143
|
"validmind.model_validation.sklearn.OverfitDiagnosis",
|
139
144
|
"validmind.model_validation.sklearn.PermutationFeatureImportance",
|
@@ -147,6 +152,7 @@ TestID = Union[
|
|
147
152
|
"validmind.model_validation.sklearn.RegressionR2SquareComparison",
|
148
153
|
"validmind.model_validation.sklearn.RobustnessDiagnosis",
|
149
154
|
"validmind.model_validation.sklearn.SHAPGlobalImportance",
|
155
|
+
"validmind.model_validation.sklearn.ScoreProbabilityAlignment",
|
150
156
|
"validmind.model_validation.sklearn.SilhouettePlot",
|
151
157
|
"validmind.model_validation.sklearn.TrainingTestDegradation",
|
152
158
|
"validmind.model_validation.sklearn.VMeasure",
|
@@ -166,9 +172,20 @@ TestID = Union[
|
|
166
172
|
"validmind.model_validation.statsmodels.RegressionModelSummary",
|
167
173
|
"validmind.model_validation.statsmodels.RegressionPermutationFeatureImportance",
|
168
174
|
"validmind.model_validation.statsmodels.ScorecardHistogram",
|
175
|
+
"validmind.ongoing_monitoring.CalibrationCurveDrift",
|
176
|
+
"validmind.ongoing_monitoring.ClassDiscriminationDrift",
|
177
|
+
"validmind.ongoing_monitoring.ClassImbalanceDrift",
|
178
|
+
"validmind.ongoing_monitoring.ClassificationAccuracyDrift",
|
179
|
+
"validmind.ongoing_monitoring.ConfusionMatrixDrift",
|
180
|
+
"validmind.ongoing_monitoring.CumulativePredictionProbabilitiesDrift",
|
169
181
|
"validmind.ongoing_monitoring.FeatureDrift",
|
170
182
|
"validmind.ongoing_monitoring.PredictionAcrossEachFeature",
|
171
183
|
"validmind.ongoing_monitoring.PredictionCorrelation",
|
184
|
+
"validmind.ongoing_monitoring.PredictionProbabilitiesHistogramDrift",
|
185
|
+
"validmind.ongoing_monitoring.PredictionQuantilesAcrossFeatures",
|
186
|
+
"validmind.ongoing_monitoring.ROCCurveDrift",
|
187
|
+
"validmind.ongoing_monitoring.ScoreBandsDrift",
|
188
|
+
"validmind.ongoing_monitoring.ScorecardHistogramDrift",
|
172
189
|
"validmind.ongoing_monitoring.TargetPredictionDistributionPlot",
|
173
190
|
"validmind.prompt_validation.Bias",
|
174
191
|
"validmind.prompt_validation.Clarity",
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
6
6
|
import plotly.graph_objects as go
|
7
7
|
from statsmodels.tsa.stattools import acf, pacf
|
8
8
|
|
9
|
-
from validmind import tags, tasks
|
9
|
+
from validmind import RawData, tags, tasks
|
10
10
|
from validmind.vm_models import VMDataset
|
11
11
|
|
12
12
|
|
@@ -62,6 +62,8 @@ def ACFandPACFPlot(dataset: VMDataset):
|
|
62
62
|
raise ValueError("Provided 'columns' must exist in the dataset")
|
63
63
|
|
64
64
|
figures = []
|
65
|
+
acf_store = {}
|
66
|
+
pacf_store = {}
|
65
67
|
for col in df.columns:
|
66
68
|
series = df[col]
|
67
69
|
max_lags = min(40, len(series) // 2 - 1)
|
@@ -77,6 +79,7 @@ def ACFandPACFPlot(dataset: VMDataset):
|
|
77
79
|
font=dict(size=18),
|
78
80
|
)
|
79
81
|
figures.append(acf_fig)
|
82
|
+
acf_store[col] = acf_values
|
80
83
|
|
81
84
|
# Create PACF plot using Plotly
|
82
85
|
pacf_values = pacf(series, nlags=max_lags)
|
@@ -89,5 +92,6 @@ def ACFandPACFPlot(dataset: VMDataset):
|
|
89
92
|
font=dict(size=18),
|
90
93
|
)
|
91
94
|
figures.append(pacf_fig)
|
95
|
+
pacf_store[col] = pacf_values
|
92
96
|
|
93
|
-
return
|
97
|
+
return (*figures, RawData(acf_values=acf_store, pacf_values=pacf_store))
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
6
6
|
from statsmodels.tsa.arima.model import ARIMA
|
7
7
|
from statsmodels.tsa.stattools import adfuller
|
8
8
|
|
9
|
-
from validmind import tags, tasks
|
9
|
+
from validmind import RawData, tags, tasks
|
10
10
|
from validmind.logging import get_logger
|
11
11
|
from validmind.vm_models import VMDataset
|
12
12
|
|
@@ -116,4 +116,4 @@ def AutoMA(dataset: VMDataset, max_ma_order: int = 3):
|
|
116
116
|
return {
|
117
117
|
"Auto MA Analysis Results": summary_ma_analysis,
|
118
118
|
"Best MA Order Results": best_ma_order,
|
119
|
-
}
|
119
|
+
}, RawData(raw_series_data=df)
|
@@ -6,7 +6,7 @@ import itertools
|
|
6
6
|
|
7
7
|
import plotly.express as px
|
8
8
|
|
9
|
-
from validmind import tags, tasks
|
9
|
+
from validmind import RawData, tags, tasks
|
10
10
|
|
11
11
|
|
12
12
|
@tags("tabular_data", "numerical_data", "visualization")
|
@@ -79,4 +79,6 @@ def BivariateScatterPlots(dataset):
|
|
79
79
|
|
80
80
|
figures.append(fig)
|
81
81
|
|
82
|
-
return tuple(figures)
|
82
|
+
return tuple(figures) + (
|
83
|
+
RawData(selected_numerical_df=df, feature_pairs=features_pairs),
|
84
|
+
)
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import pandas as pd
|
6
6
|
from statsmodels.stats.diagnostic import acorr_ljungbox
|
7
7
|
|
8
|
-
from validmind import tags, tasks
|
8
|
+
from validmind import RawData, tags, tasks
|
9
9
|
|
10
10
|
|
11
11
|
@tasks("regression")
|
@@ -68,4 +68,4 @@ def BoxPierce(dataset):
|
|
68
68
|
box_pierce_df.reset_index(inplace=True)
|
69
69
|
box_pierce_df.columns = ["column", "stat", "pvalue"]
|
70
70
|
|
71
|
-
return box_pierce_df
|
71
|
+
return box_pierce_df, RawData(box_pierce_values=box_pierce_values)
|
@@ -9,7 +9,7 @@ from typing import Any, Dict, Tuple
|
|
9
9
|
|
10
10
|
import plotly.graph_objs as go
|
11
11
|
|
12
|
-
from validmind import tags, tasks
|
12
|
+
from validmind import RawData, tags, tasks
|
13
13
|
from validmind.errors import SkipTestError
|
14
14
|
from validmind.vm_models import VMDataset
|
15
15
|
|
@@ -104,4 +104,5 @@ def ClassImbalance(
|
|
104
104
|
},
|
105
105
|
go.Figure(data=[trace], layout=layout),
|
106
106
|
all(row["Pass/Fail"] == "Pass" for row in imbalanced_classes),
|
107
|
+
RawData(imbalance_percentages=imbalance_percentages),
|
107
108
|
)
|
@@ -9,7 +9,7 @@ import numpy as np
|
|
9
9
|
from ydata_profiling.config import Settings
|
10
10
|
from ydata_profiling.model.typeset import ProfilingTypeSet
|
11
11
|
|
12
|
-
from validmind import tags, tasks
|
12
|
+
from validmind import RawData, tags, tasks
|
13
13
|
from validmind.errors import UnsupportedColumnTypeError
|
14
14
|
from validmind.logging import get_logger
|
15
15
|
from validmind.vm_models import VMDataset
|
@@ -220,6 +220,15 @@ def DatasetDescription(dataset: VMDataset):
|
|
220
220
|
for column in infer_datatypes(df):
|
221
221
|
results.append(describe_column(df, column))
|
222
222
|
|
223
|
+
raw_data = {
|
224
|
+
column["id"]: {
|
225
|
+
"type": column["type"],
|
226
|
+
"statistics": column["statistics"],
|
227
|
+
"histograms": column["histograms"],
|
228
|
+
}
|
229
|
+
for column in results
|
230
|
+
}
|
231
|
+
|
223
232
|
return {
|
224
233
|
"Dataset Description": [
|
225
234
|
{
|
@@ -233,4 +242,4 @@ def DatasetDescription(dataset: VMDataset):
|
|
233
242
|
}
|
234
243
|
for column in results
|
235
244
|
]
|
236
|
-
}
|
245
|
+
}, RawData(raw_data=raw_data)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
from typing import List
|
6
6
|
|
7
|
-
from validmind import tags, tasks
|
7
|
+
from validmind import RawData, tags, tasks
|
8
8
|
from validmind.vm_models import VMDataset
|
9
9
|
|
10
10
|
DATASET_LABELS = {
|
@@ -98,4 +98,4 @@ def DatasetSplit(datasets: List[VMDataset]):
|
|
98
98
|
}
|
99
99
|
)
|
100
100
|
|
101
|
-
return table
|
101
|
+
return table, RawData(dataset_results=results)
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
6
6
|
from arch.unitroot import DFGLS
|
7
7
|
from numpy.linalg import LinAlgError
|
8
8
|
|
9
|
-
from validmind import tags, tasks
|
9
|
+
from validmind import RawData, tags, tasks
|
10
10
|
from validmind.errors import SkipTestError
|
11
11
|
from validmind.logging import get_logger
|
12
12
|
from validmind.vm_models import VMDataset
|
@@ -97,4 +97,4 @@ def DickeyFullerGLS(dataset: VMDataset):
|
|
97
97
|
|
98
98
|
return {
|
99
99
|
"DFGLS Test Results": dfgls_values,
|
100
|
-
}
|
100
|
+
}, RawData(df=df)
|
@@ -6,7 +6,7 @@
|
|
6
6
|
import numpy as np
|
7
7
|
import plotly.graph_objects as go
|
8
8
|
|
9
|
-
from validmind import tags, tasks
|
9
|
+
from validmind import RawData, tags, tasks
|
10
10
|
|
11
11
|
|
12
12
|
@tags("tabular_data", "visualization", "correlation")
|
@@ -58,7 +58,13 @@ def FeatureTargetCorrelationPlot(dataset, fig_height=600):
|
|
58
58
|
|
59
59
|
fig = _visualize_feature_target_correlation(df, dataset.target_column, fig_height)
|
60
60
|
|
61
|
-
|
61
|
+
correlations = (
|
62
|
+
df.corr(numeric_only=True)[dataset.target_column]
|
63
|
+
.drop(dataset.target_column)
|
64
|
+
.to_frame()
|
65
|
+
)
|
66
|
+
|
67
|
+
return fig, RawData(correlation_data=correlations)
|
62
68
|
|
63
69
|
|
64
70
|
def _visualize_feature_target_correlation(df, target_column, fig_height):
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from validmind import tags, tasks
|
5
|
+
from validmind import RawData, tags, tasks
|
6
6
|
from validmind.vm_models import VMDataset
|
7
7
|
|
8
8
|
|
@@ -59,6 +59,8 @@ def HighCardinality(
|
|
59
59
|
table = []
|
60
60
|
all_passed = True
|
61
61
|
|
62
|
+
raw_data = {}
|
63
|
+
|
62
64
|
for col in dataset.feature_columns_categorical:
|
63
65
|
n_distinct = df[col].nunique()
|
64
66
|
p_distinct = n_distinct / df.shape[0]
|
@@ -73,7 +75,12 @@ def HighCardinality(
|
|
73
75
|
}
|
74
76
|
)
|
75
77
|
|
78
|
+
raw_data[col] = {
|
79
|
+
"n_distinct": n_distinct,
|
80
|
+
"p_distinct": p_distinct,
|
81
|
+
}
|
82
|
+
|
76
83
|
if not passed:
|
77
84
|
all_passed = False
|
78
85
|
|
79
|
-
return table, all_passed
|
86
|
+
return table, all_passed, RawData(raw_cardinality_details=raw_data)
|
@@ -2,14 +2,17 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from validmind import tags, tasks
|
5
|
+
from validmind import RawData, tags, tasks
|
6
6
|
from validmind.vm_models import VMDataset
|
7
7
|
|
8
8
|
|
9
9
|
@tags("tabular_data", "data_quality", "correlation")
|
10
10
|
@tasks("classification", "regression")
|
11
11
|
def HighPearsonCorrelation(
|
12
|
-
dataset: VMDataset,
|
12
|
+
dataset: VMDataset,
|
13
|
+
max_threshold: float = 0.3,
|
14
|
+
top_n_correlations: int = 10,
|
15
|
+
feature_columns: list = None,
|
13
16
|
):
|
14
17
|
"""
|
15
18
|
Identifies highly correlated feature pairs in a dataset suggesting feature redundancy or multicollinearity.
|
@@ -51,8 +54,15 @@ def HighPearsonCorrelation(
|
|
51
54
|
- Limited to identifying redundancy only within feature pairs; may fail to spot more complex relationships among
|
52
55
|
three or more variables.
|
53
56
|
"""
|
57
|
+
|
58
|
+
# Select features
|
59
|
+
if feature_columns is None:
|
60
|
+
df = dataset.df
|
61
|
+
else:
|
62
|
+
df = dataset.df[feature_columns]
|
63
|
+
|
54
64
|
# Get correlation matrix for numeric columns
|
55
|
-
corr =
|
65
|
+
corr = df.corr(numeric_only=True)
|
56
66
|
|
57
67
|
# Create table of correlation coefficients and column pairs
|
58
68
|
pairs = []
|
@@ -71,4 +81,8 @@ def HighPearsonCorrelation(
|
|
71
81
|
pairs.sort(key=lambda x: abs(x["Coefficient"]), reverse=True)
|
72
82
|
pairs = pairs[:top_n_correlations]
|
73
83
|
|
74
|
-
return
|
84
|
+
return (
|
85
|
+
pairs,
|
86
|
+
all(p["Pass/Fail"] == "Pass" for p in pairs),
|
87
|
+
RawData(correlation_matrix=corr),
|
88
|
+
)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
import plotly.graph_objects as go
|
6
6
|
|
7
|
-
from validmind import tags, tasks
|
7
|
+
from validmind import RawData, tags, tasks
|
8
8
|
from validmind.vm_models import VMDataset
|
9
9
|
|
10
10
|
|
@@ -118,4 +118,11 @@ def IQROutliersBarPlot(
|
|
118
118
|
)
|
119
119
|
figures.append(fig)
|
120
120
|
|
121
|
-
return
|
121
|
+
return (
|
122
|
+
*figures,
|
123
|
+
RawData(
|
124
|
+
outlier_counts_by_feature=df[dataset.feature_columns_numeric].apply(
|
125
|
+
lambda col: compute_outliers(col, threshold)
|
126
|
+
)
|
127
|
+
),
|
128
|
+
)
|
@@ -6,7 +6,7 @@ import numpy as np
|
|
6
6
|
import pandas as pd
|
7
7
|
import plotly.figure_factory as ff
|
8
8
|
|
9
|
-
from validmind import tags, tasks
|
9
|
+
from validmind import RawData, tags, tasks
|
10
10
|
from validmind.vm_models import VMDataset
|
11
11
|
|
12
12
|
# Define the 'coolwarm' color scale manually
|
@@ -101,4 +101,4 @@ def LaggedCorrelationHeatmap(dataset: VMDataset, num_lags: int = 10):
|
|
101
101
|
xaxis_title="Lags",
|
102
102
|
)
|
103
103
|
|
104
|
-
return fig
|
104
|
+
return fig, RawData(correlation_matrix=correlation_df)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
import plotly.graph_objects as go
|
6
6
|
|
7
|
-
from validmind import tags, tasks
|
7
|
+
from validmind import RawData, tags, tasks
|
8
8
|
from validmind.vm_models import VMDataset
|
9
9
|
|
10
10
|
|
@@ -106,13 +106,16 @@ def MissingValuesBarPlot(
|
|
106
106
|
line=dict(color="red", dash="dash"),
|
107
107
|
)
|
108
108
|
|
109
|
-
return
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
109
|
+
return (
|
110
|
+
go.Figure(
|
111
|
+
data=[trace_below_threshold, trace_above_threshold, threshold_line],
|
112
|
+
layout=go.Layout(
|
113
|
+
title="Missing Values",
|
114
|
+
yaxis=dict(title="Columns"),
|
115
|
+
xaxis=dict(title="Missing Value Percentage (%)", range=[0, 100]),
|
116
|
+
barmode="stack",
|
117
|
+
height=fig_height,
|
118
|
+
),
|
117
119
|
),
|
120
|
+
RawData(missing_percentages=missing_percentages_sorted),
|
118
121
|
)
|
@@ -4,6 +4,7 @@
|
|
4
4
|
|
5
5
|
import plotly.graph_objects as go
|
6
6
|
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
|
7
|
+
|
7
8
|
from validmind import tags, tasks
|
8
9
|
from validmind.vm_models import VMDataset
|
9
10
|
from validmind.vm_models.result import RawData
|
@@ -76,13 +77,6 @@ def MutualInformation(
|
|
76
77
|
else:
|
77
78
|
mi_scores = mutual_info_regression(X, y)
|
78
79
|
|
79
|
-
# Create DataFrame for raw data
|
80
|
-
raw_data = RawData(
|
81
|
-
feature=dataset.feature_columns,
|
82
|
-
mutual_information_score=mi_scores.tolist(),
|
83
|
-
pass_fail=["Pass" if score >= min_threshold else "Fail" for score in mi_scores],
|
84
|
-
)
|
85
|
-
|
86
80
|
# Create Plotly figure
|
87
81
|
fig = go.Figure()
|
88
82
|
|
@@ -126,4 +120,8 @@ def MutualInformation(
|
|
126
120
|
template="plotly_white",
|
127
121
|
)
|
128
122
|
|
129
|
-
return
|
123
|
+
return fig, RawData(
|
124
|
+
mutual_information_scores={
|
125
|
+
feature: score for feature, score in zip(sorted_features, sorted_scores)
|
126
|
+
}
|
127
|
+
)
|
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
import plotly.graph_objects as go
|
7
7
|
|
8
|
-
from validmind import tags, tasks
|
8
|
+
from validmind import RawData, tags, tasks
|
9
9
|
|
10
10
|
|
11
11
|
@tags("tabular_data", "numerical_data", "correlation")
|
@@ -88,4 +88,4 @@ def PearsonCorrelationMatrix(dataset):
|
|
88
88
|
|
89
89
|
fig = go.Figure(data=[heatmap], layout=layout)
|
90
90
|
|
91
|
-
return fig
|
91
|
+
return fig, RawData(correlation_matrix=corr_matrix)
|
@@ -8,7 +8,7 @@ import pandas as pd
|
|
8
8
|
import plotly.graph_objects as go
|
9
9
|
import plotly.subplots as sp
|
10
10
|
|
11
|
-
from validmind import tags, tasks
|
11
|
+
from validmind import RawData, tags, tasks
|
12
12
|
from validmind.errors import MissingDependencyError
|
13
13
|
from validmind.logging import get_logger
|
14
14
|
|
@@ -202,4 +202,9 @@ def ProtectedClassesCombination(dataset, model, protected_classes=None):
|
|
202
202
|
{"Class Combination Table": metrics_by_group},
|
203
203
|
{"DPR and EOR table": dpr_eor_df},
|
204
204
|
fig,
|
205
|
+
RawData(
|
206
|
+
metrics_frame=mf,
|
207
|
+
demographic_parity_ratios=m_dpr,
|
208
|
+
equalized_odds_ratios=m_eqo,
|
209
|
+
),
|
205
210
|
)
|
@@ -119,7 +119,7 @@ def ProtectedClassesDisparity(
|
|
119
119
|
mask_significance=True,
|
120
120
|
)
|
121
121
|
|
122
|
-
|
122
|
+
figures = []
|
123
123
|
for protected_class in protected_classes:
|
124
124
|
plot = ap.disparity(
|
125
125
|
bdf, metrics, protected_class, fairness_threshold=disparity_tolerance
|
@@ -129,13 +129,12 @@ def ProtectedClassesDisparity(
|
|
129
129
|
plot.save(
|
130
130
|
buf, format="png"
|
131
131
|
) # as long as the above library is installed, this will work
|
132
|
-
|
132
|
+
figures.append(buf.getvalue())
|
133
133
|
|
134
134
|
string = "_disparity"
|
135
135
|
metrics_adj = [x + string for x in metrics]
|
136
136
|
|
137
137
|
table = bdf[["attribute_name", "attribute_value"] + b.list_disparities(bdf)]
|
138
|
-
|
139
|
-
plots_return = tuple(plots)
|
138
|
+
figures.append(aqp.plot_disparity_all(bdf, metrics=metrics_adj))
|
140
139
|
|
141
|
-
return (table, *
|
140
|
+
return (table, *figures)
|
@@ -103,10 +103,7 @@ def ProtectedClassesThresholdOptimizer(
|
|
103
103
|
test_df, target, y_pred_opt, protected_classes
|
104
104
|
)
|
105
105
|
|
106
|
-
return (
|
107
|
-
{"DPR and EOR Table": fairness_metrics.reset_index()},
|
108
|
-
fig,
|
109
|
-
)
|
106
|
+
return {"DPR and EOR Table": fairness_metrics.reset_index()}, fig
|
110
107
|
|
111
108
|
|
112
109
|
def initialize_and_fit_optimizer(pipeline, X_train, y_train, protected_classes_df):
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import matplotlib.pyplot as plt
|
6
6
|
import pandas as pd
|
7
7
|
|
8
|
-
from validmind import tags, tasks
|
8
|
+
from validmind import RawData, tags, tasks
|
9
9
|
from validmind.errors import SkipTestError
|
10
10
|
from validmind.vm_models import VMDataset
|
11
11
|
|
@@ -95,13 +95,24 @@ def RollingStatsPlot(dataset: VMDataset, window_size: int = 12):
|
|
95
95
|
if not pd.api.types.is_datetime64_any_dtype(dataset.df.index):
|
96
96
|
raise SkipTestError("Index must be a datetime type")
|
97
97
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
98
|
+
figures = [
|
99
|
+
plot_rolling_statistics(
|
100
|
+
df=dataset.df.dropna(),
|
101
|
+
col=col,
|
102
|
+
window_size=window_size,
|
103
|
+
)
|
104
|
+
for col in dataset.feature_columns
|
105
|
+
]
|
106
|
+
|
107
|
+
return (
|
108
|
+
*figures,
|
109
|
+
RawData(
|
110
|
+
rolling_means_stds={
|
111
|
+
col: {
|
112
|
+
"rolling_mean": dataset.df[col].rolling(window=window_size).mean(),
|
113
|
+
"rolling_std": dataset.df[col].rolling(window=window_size).std(),
|
114
|
+
}
|
115
|
+
for col in dataset.feature_columns
|
116
|
+
}
|
117
|
+
),
|
107
118
|
)
|
@@ -55,8 +55,8 @@ def ScatterPlot(dataset):
|
|
55
55
|
- Assumes that the dataset can fit into the computer's memory, which might not be valid for extremely large
|
56
56
|
datasets.
|
57
57
|
"""
|
58
|
-
|
59
58
|
g = sns.pairplot(data=dataset.df, diag_kind="kde")
|
59
|
+
|
60
60
|
for ax in g.axes.flatten():
|
61
61
|
# rotate x axis labels
|
62
62
|
ax.set_xlabel(ax.get_xlabel(), rotation=45)
|
@@ -64,12 +64,10 @@ def ScatterPlot(dataset):
|
|
64
64
|
ax.set_ylabel(ax.get_ylabel(), rotation=45)
|
65
65
|
# set y labels alignment
|
66
66
|
ax.yaxis.get_label().set_horizontalalignment("right")
|
67
|
+
|
67
68
|
# Get the current figure
|
68
69
|
fig = plt.gcf()
|
69
70
|
|
70
|
-
figures = []
|
71
|
-
figures.append(fig)
|
72
|
-
|
73
71
|
plt.close("all")
|
74
72
|
|
75
|
-
return
|
73
|
+
return fig
|
@@ -2,8 +2,9 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
import pandas as pd
|
6
5
|
import numpy as np
|
6
|
+
import pandas as pd
|
7
|
+
|
7
8
|
from validmind import tags, tasks
|
8
9
|
from validmind.vm_models import VMDataset, VMModel
|
9
10
|
|
@@ -9,7 +9,7 @@ from plotly.subplots import make_subplots
|
|
9
9
|
from scipy import stats
|
10
10
|
from statsmodels.tsa.seasonal import seasonal_decompose
|
11
11
|
|
12
|
-
from validmind import tags, tasks
|
12
|
+
from validmind import RawData, tags, tasks
|
13
13
|
from validmind.errors import SkipTestError
|
14
14
|
from validmind.logging import get_logger
|
15
15
|
from validmind.vm_models import VMDataset
|
@@ -65,6 +65,8 @@ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
|
|
65
65
|
|
66
66
|
figures = []
|
67
67
|
|
68
|
+
raw_data = {}
|
69
|
+
|
68
70
|
for col in df.columns:
|
69
71
|
series = df[col].dropna()
|
70
72
|
|
@@ -153,7 +155,15 @@ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
|
|
153
155
|
|
154
156
|
figures.append(fig)
|
155
157
|
|
158
|
+
# Add the decomposed components to raw_data
|
159
|
+
raw_data[col] = {
|
160
|
+
"observed": sd.observed,
|
161
|
+
"trend": sd.trend,
|
162
|
+
"seasonal": sd.seasonal,
|
163
|
+
"residuals": sd.resid,
|
164
|
+
}
|
165
|
+
|
156
166
|
if not figures:
|
157
167
|
raise SkipTestError("No valid features found for seasonal decomposition")
|
158
168
|
|
159
|
-
return
|
169
|
+
return (*figures, RawData(decomposed_components=raw_data))
|