validmind 2.8.10__py3-none-any.whl → 2.8.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +6 -5
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +17 -11
- validmind/ai/utils.py +2 -2
- validmind/api_client.py +75 -32
- validmind/client.py +108 -100
- validmind/client_config.py +3 -3
- validmind/datasets/classification/__init__.py +7 -3
- validmind/datasets/credit_risk/lending_club.py +28 -16
- validmind/datasets/nlp/cnn_dailymail.py +10 -4
- validmind/datasets/regression/__init__.py +22 -5
- validmind/errors.py +17 -7
- validmind/input_registry.py +1 -1
- validmind/logging.py +44 -35
- validmind/models/foundation.py +2 -2
- validmind/models/function.py +10 -3
- validmind/template.py +30 -22
- validmind/test_suites/__init__.py +2 -2
- validmind/tests/_store.py +13 -4
- validmind/tests/comparison.py +65 -33
- validmind/tests/data_validation/ACFandPACFPlot.py +4 -1
- validmind/tests/data_validation/AutoMA.py +1 -1
- validmind/tests/data_validation/BivariateScatterPlots.py +5 -1
- validmind/tests/data_validation/BoxPierce.py +3 -1
- validmind/tests/data_validation/ClassImbalance.py +4 -2
- validmind/tests/data_validation/DatasetDescription.py +3 -24
- validmind/tests/data_validation/DescriptiveStatistics.py +1 -1
- validmind/tests/data_validation/DickeyFullerGLS.py +1 -1
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +1 -1
- validmind/tests/data_validation/HighCardinality.py +5 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +1 -1
- validmind/tests/data_validation/IQROutliersBarPlot.py +5 -3
- validmind/tests/data_validation/IQROutliersTable.py +5 -2
- validmind/tests/data_validation/IsolationForestOutliers.py +5 -4
- validmind/tests/data_validation/JarqueBera.py +2 -2
- validmind/tests/data_validation/LJungBox.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/MissingValues.py +14 -10
- validmind/tests/data_validation/MissingValuesBarPlot.py +3 -1
- validmind/tests/data_validation/MutualInformation.py +2 -1
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +1 -1
- validmind/tests/data_validation/ProtectedClassesCombination.py +2 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +2 -2
- validmind/tests/data_validation/ProtectedClassesDisparity.py +9 -5
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +10 -2
- validmind/tests/data_validation/RollingStatsPlot.py +2 -1
- validmind/tests/data_validation/ScoreBandDefaultRates.py +4 -2
- validmind/tests/data_validation/SeasonalDecompose.py +1 -1
- validmind/tests/data_validation/ShapiroWilk.py +2 -2
- validmind/tests/data_validation/Skewness.py +7 -6
- validmind/tests/data_validation/SpreadPlot.py +1 -1
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +1 -1
- validmind/tests/data_validation/TabularDateTimeHistograms.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +4 -1
- validmind/tests/data_validation/TimeSeriesFrequency.py +1 -1
- validmind/tests/data_validation/TimeSeriesOutliers.py +7 -2
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/data_validation/ZivotAndrewsArch.py +5 -2
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +5 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +3 -1
- validmind/tests/data_validation/nlp/TextDescription.py +1 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +14 -11
- validmind/tests/load.py +38 -24
- validmind/tests/model_validation/BertScore.py +7 -1
- validmind/tests/model_validation/BleuScore.py +7 -1
- validmind/tests/model_validation/ClusterSizeDistribution.py +3 -1
- validmind/tests/model_validation/ContextualRecall.py +9 -1
- validmind/tests/model_validation/FeaturesAUC.py +1 -1
- validmind/tests/model_validation/MeteorScore.py +7 -1
- validmind/tests/model_validation/ModelPredictionResiduals.py +5 -1
- validmind/tests/model_validation/RegardScore.py +6 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -1
- validmind/tests/model_validation/RougeScore.py +3 -1
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +2 -0
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +10 -2
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +6 -2
- validmind/tests/model_validation/TokenDisparity.py +5 -1
- validmind/tests/model_validation/ToxicityScore.py +2 -0
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +5 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +5 -1
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -0
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +5 -1
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +6 -2
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +3 -1
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +5 -1
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +6 -1
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -3
- validmind/tests/model_validation/ragas/AspectCritic.py +4 -1
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -3
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -3
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -3
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -3
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -3
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +1 -1
- validmind/tests/model_validation/ragas/ResponseRelevancy.py +5 -3
- validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -3
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +9 -9
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +9 -9
- validmind/tests/model_validation/sklearn/CalibrationCurve.py +5 -2
- validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +28 -5
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -1
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +24 -14
- validmind/tests/model_validation/sklearn/CompletenessScore.py +8 -9
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -3
- validmind/tests/model_validation/sklearn/FeatureImportance.py +6 -2
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -9
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +14 -9
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +4 -2
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +6 -1
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +12 -7
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +12 -7
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +21 -6
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +5 -1
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -1
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +6 -1
- validmind/tests/model_validation/sklearn/ROCCurve.py +3 -1
- validmind/tests/model_validation/sklearn/RegressionErrors.py +6 -2
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +13 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +8 -5
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +5 -1
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +34 -26
- validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +10 -2
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -1
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -9
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +15 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +6 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +8 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +2 -2
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +6 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +8 -2
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +3 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +7 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -0
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -0
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +4 -2
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +3 -1
- validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +11 -1
- validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +10 -2
- validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +8 -1
- validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +18 -2
- validmind/tests/ongoing_monitoring/FeatureDrift.py +9 -2
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +8 -2
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +13 -2
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +13 -2
- validmind/tests/ongoing_monitoring/ROCCurveDrift.py +16 -2
- validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +11 -2
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +13 -2
- validmind/tests/output.py +66 -11
- validmind/tests/prompt_validation/Clarity.py +1 -1
- validmind/tests/prompt_validation/NegativeInstruction.py +1 -1
- validmind/tests/prompt_validation/Robustness.py +6 -1
- validmind/tests/prompt_validation/Specificity.py +1 -1
- validmind/tests/run.py +28 -14
- validmind/tests/test_providers.py +28 -35
- validmind/tests/utils.py +17 -4
- validmind/unit_metrics/__init__.py +1 -1
- validmind/utils.py +295 -31
- validmind/vm_models/dataset/dataset.py +19 -16
- validmind/vm_models/dataset/utils.py +5 -3
- validmind/vm_models/figure.py +6 -6
- validmind/vm_models/input.py +6 -5
- validmind/vm_models/model.py +5 -5
- validmind/vm_models/result/result.py +122 -43
- validmind/vm_models/result/utils.py +9 -28
- validmind/vm_models/test_suite/__init__.py +5 -0
- validmind/vm_models/test_suite/runner.py +5 -5
- validmind/vm_models/test_suite/summary.py +20 -2
- validmind/vm_models/test_suite/test.py +6 -6
- validmind/vm_models/test_suite/test_suite.py +10 -10
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/METADATA +4 -5
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/RECORD +189 -188
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/WHEEL +1 -1
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/LICENSE +0 -0
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/entry_points.txt +0 -0
@@ -80,5 +80,9 @@ def BivariateScatterPlots(dataset):
|
|
80
80
|
figures.append(fig)
|
81
81
|
|
82
82
|
return tuple(figures) + (
|
83
|
-
RawData(
|
83
|
+
RawData(
|
84
|
+
selected_numerical_df=df,
|
85
|
+
feature_pairs=features_pairs,
|
86
|
+
dataset=dataset.input_id,
|
87
|
+
),
|
84
88
|
)
|
@@ -68,4 +68,6 @@ def BoxPierce(dataset):
|
|
68
68
|
box_pierce_df.reset_index(inplace=True)
|
69
69
|
box_pierce_df.columns = ["column", "stat", "pvalue"]
|
70
70
|
|
71
|
-
return box_pierce_df, RawData(
|
71
|
+
return box_pierce_df, RawData(
|
72
|
+
box_pierce_values=box_pierce_values, dataset=dataset.input_id
|
73
|
+
)
|
@@ -14,7 +14,9 @@ from validmind.errors import SkipTestError
|
|
14
14
|
from validmind.vm_models import VMDataset
|
15
15
|
|
16
16
|
|
17
|
-
@tags(
|
17
|
+
@tags(
|
18
|
+
"tabular_data", "binary_classification", "multiclass_classification", "data_quality"
|
19
|
+
)
|
18
20
|
@tasks("classification")
|
19
21
|
def ClassImbalance(
|
20
22
|
dataset: VMDataset, min_percent_threshold: int = 10
|
@@ -104,5 +106,5 @@ def ClassImbalance(
|
|
104
106
|
},
|
105
107
|
go.Figure(data=[trace], layout=layout),
|
106
108
|
all(row["Pass/Fail"] == "Pass" for row in imbalanced_classes),
|
107
|
-
RawData(imbalance_percentages=imbalance_percentages),
|
109
|
+
RawData(imbalance_percentages=imbalance_percentages, dataset=dataset.input_id),
|
108
110
|
)
|
@@ -6,12 +6,10 @@ import re
|
|
6
6
|
from collections import Counter
|
7
7
|
|
8
8
|
import numpy as np
|
9
|
-
from ydata_profiling.config import Settings
|
10
|
-
from ydata_profiling.model.typeset import ProfilingTypeSet
|
11
9
|
|
12
10
|
from validmind import RawData, tags, tasks
|
13
|
-
from validmind.errors import UnsupportedColumnTypeError
|
14
11
|
from validmind.logging import get_logger
|
12
|
+
from validmind.utils import infer_datatypes
|
15
13
|
from validmind.vm_models import VMDataset
|
16
14
|
|
17
15
|
DEFAULT_HISTOGRAM_BINS = 10
|
@@ -20,25 +18,6 @@ DEFAULT_HISTOGRAM_BIN_SIZES = [5, 10, 20, 50]
|
|
20
18
|
logger = get_logger(__name__)
|
21
19
|
|
22
20
|
|
23
|
-
def infer_datatypes(df):
|
24
|
-
column_type_mappings = {}
|
25
|
-
typeset = ProfilingTypeSet(Settings())
|
26
|
-
variable_types = typeset.infer_type(df)
|
27
|
-
|
28
|
-
for column, type in variable_types.items():
|
29
|
-
if str(type) == "Unsupported":
|
30
|
-
if df[column].isnull().all():
|
31
|
-
column_type_mappings[column] = {"id": column, "type": "Null"}
|
32
|
-
else:
|
33
|
-
raise UnsupportedColumnTypeError(
|
34
|
-
f"Unsupported type for column {column}. Please review all values in this dataset column."
|
35
|
-
)
|
36
|
-
else:
|
37
|
-
column_type_mappings[column] = {"id": column, "type": str(type)}
|
38
|
-
|
39
|
-
return list(column_type_mappings.values())
|
40
|
-
|
41
|
-
|
42
21
|
def get_numerical_histograms(df, column):
|
43
22
|
"""
|
44
23
|
Returns a collection of histograms for a numerical column, each one
|
@@ -50,7 +29,7 @@ def get_numerical_histograms(df, column):
|
|
50
29
|
# bins='sturges'. Cannot use 'auto' until we review and fix its performance
|
51
30
|
# on datasets with too many unique values
|
52
31
|
#
|
53
|
-
# 'sturges': R
|
32
|
+
# 'sturges': R's default method, only accounts for data size. Only optimal
|
54
33
|
# for gaussian data and underestimates number of bins for large non-gaussian datasets.
|
55
34
|
default_hist = np.histogram(values_cleaned, bins="sturges")
|
56
35
|
|
@@ -242,4 +221,4 @@ def DatasetDescription(dataset: VMDataset):
|
|
242
221
|
}
|
243
222
|
for column in results
|
244
223
|
]
|
245
|
-
}, RawData(raw_data=raw_data)
|
224
|
+
}, RawData(raw_data=raw_data, dataset=dataset.input_id)
|
@@ -44,7 +44,7 @@ def get_summary_statistics_categorical(df, categorical_fields):
|
|
44
44
|
return summary_stats
|
45
45
|
|
46
46
|
|
47
|
-
@tags("tabular_data", "time_series_data")
|
47
|
+
@tags("tabular_data", "time_series_data", "data_quality")
|
48
48
|
@tasks("classification", "regression")
|
49
49
|
def DescriptiveStatistics(dataset: VMDataset):
|
50
50
|
"""
|
@@ -58,7 +58,7 @@ def FeatureTargetCorrelationPlot(dataset, fig_height=600):
|
|
58
58
|
df, dataset.target_column, fig_height
|
59
59
|
)
|
60
60
|
|
61
|
-
return fig, RawData(correlation_data=correlations)
|
61
|
+
return fig, RawData(correlation_data=correlations, dataset=dataset.input_id)
|
62
62
|
|
63
63
|
|
64
64
|
def _visualize_feature_target_correlation(df, target_column, fig_height):
|
@@ -118,11 +118,13 @@ def IQROutliersBarPlot(
|
|
118
118
|
)
|
119
119
|
figures.append(fig)
|
120
120
|
|
121
|
+
outliers_by_feature = df[dataset.feature_columns_numeric].apply(
|
122
|
+
lambda col: compute_outliers(col, threshold)
|
123
|
+
)
|
124
|
+
|
121
125
|
return (
|
122
126
|
*figures,
|
123
127
|
RawData(
|
124
|
-
outlier_counts_by_feature=
|
125
|
-
lambda col: compute_outliers(col, threshold)
|
126
|
-
)
|
128
|
+
outlier_counts_by_feature=outliers_by_feature, dataset=dataset.input_id
|
127
129
|
),
|
128
130
|
)
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from validmind import tags, tasks
|
5
|
+
from validmind import RawData, tags, tasks
|
6
6
|
from validmind.vm_models import VMDataset
|
7
7
|
|
8
8
|
|
@@ -64,6 +64,7 @@ def IQROutliersTable(dataset: VMDataset, threshold: float = 1.5):
|
|
64
64
|
df = dataset.df
|
65
65
|
|
66
66
|
outliers_table = []
|
67
|
+
all_outliers = {}
|
67
68
|
|
68
69
|
for col in dataset.feature_columns_numeric:
|
69
70
|
# Skip binary features
|
@@ -71,6 +72,8 @@ def IQROutliersTable(dataset: VMDataset, threshold: float = 1.5):
|
|
71
72
|
continue
|
72
73
|
|
73
74
|
outliers = compute_outliers(df[col], threshold)
|
75
|
+
all_outliers[col] = outliers
|
76
|
+
|
74
77
|
if outliers.empty:
|
75
78
|
continue
|
76
79
|
|
@@ -89,4 +92,4 @@ def IQROutliersTable(dataset: VMDataset, threshold: float = 1.5):
|
|
89
92
|
|
90
93
|
return {
|
91
94
|
"Summary of Outliers Detected by IQR Method": outliers_table,
|
92
|
-
}
|
95
|
+
}, RawData(all_outliers=all_outliers, dataset=dataset.input_id)
|
@@ -8,7 +8,7 @@ import matplotlib.pyplot as plt
|
|
8
8
|
import seaborn as sns
|
9
9
|
from sklearn.ensemble import IsolationForest
|
10
10
|
|
11
|
-
from validmind import tags, tasks
|
11
|
+
from validmind import RawData, tags, tasks
|
12
12
|
from validmind.vm_models import VMDataset
|
13
13
|
|
14
14
|
|
@@ -91,6 +91,7 @@ def IsolationForestOutliers(
|
|
91
91
|
|
92
92
|
figures.append(fig)
|
93
93
|
|
94
|
-
|
95
|
-
|
96
|
-
|
94
|
+
return (
|
95
|
+
*figures,
|
96
|
+
RawData(predictions=y_pred, dataset=dataset.input_id),
|
97
|
+
)
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import pandas as pd
|
6
6
|
from statsmodels.stats.stattools import jarque_bera
|
7
7
|
|
8
|
-
from validmind import tags, tasks
|
8
|
+
from validmind import RawData, tags, tasks
|
9
9
|
|
10
10
|
|
11
11
|
@tasks("classification", "regression")
|
@@ -67,4 +67,4 @@ def JarqueBera(dataset):
|
|
67
67
|
jb_df.reset_index(inplace=True)
|
68
68
|
jb_df.columns = ["column", "stat", "pvalue", "skew", "kurtosis"]
|
69
69
|
|
70
|
-
return jb_df
|
70
|
+
return jb_df, RawData(jb_values=jb_values, dataset=dataset.input_id)
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import pandas as pd
|
6
6
|
from statsmodels.stats.diagnostic import acorr_ljungbox
|
7
7
|
|
8
|
-
from validmind import tags, tasks
|
8
|
+
from validmind import RawData, tags, tasks
|
9
9
|
|
10
10
|
|
11
11
|
@tasks("regression")
|
@@ -63,4 +63,4 @@ def LJungBox(dataset):
|
|
63
63
|
ljung_box_df.reset_index(inplace=True)
|
64
64
|
ljung_box_df.columns = ["column", "stat", "pvalue"]
|
65
65
|
|
66
|
-
return ljung_box_df
|
66
|
+
return ljung_box_df, RawData(ljung_box_df=ljung_box_df, dataset=dataset.input_id)
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from validmind import tags, tasks
|
5
|
+
from validmind import RawData, tags, tasks
|
6
6
|
from validmind.vm_models import VMDataset
|
7
7
|
|
8
8
|
|
@@ -49,12 +49,16 @@ def MissingValues(dataset: VMDataset, min_threshold: int = 1):
|
|
49
49
|
df = dataset.df
|
50
50
|
missing = df.isna().sum()
|
51
51
|
|
52
|
-
return
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
52
|
+
return (
|
53
|
+
[
|
54
|
+
{
|
55
|
+
"Column": col,
|
56
|
+
"Number of Missing Values": missing[col],
|
57
|
+
"Percentage of Missing Values (%)": missing[col] / df.shape[0] * 100,
|
58
|
+
"Pass/Fail": "Pass" if missing[col] < min_threshold else "Fail",
|
59
|
+
}
|
60
|
+
for col in missing.index
|
61
|
+
],
|
62
|
+
all(missing[col] < min_threshold for col in missing.index),
|
63
|
+
RawData(missing_values=missing, dataset=dataset.input_id),
|
64
|
+
)
|
@@ -6,7 +6,7 @@
|
|
6
6
|
import pandas as pd
|
7
7
|
import plotly.graph_objects as go
|
8
8
|
|
9
|
-
from validmind import tags, tasks
|
9
|
+
from validmind import RawData, tags, tasks
|
10
10
|
from validmind.logging import get_logger
|
11
11
|
|
12
12
|
logger = get_logger(__name__)
|
@@ -127,4 +127,4 @@ def ProtectedClassesDescription(dataset, protected_classes=None):
|
|
127
127
|
["Protected Class", "Count"], ascending=[True, False]
|
128
128
|
)
|
129
129
|
|
130
|
-
return (stats_df, *figures)
|
130
|
+
return (stats_df, *figures, RawData(dataset=dataset.input_id))
|
@@ -7,7 +7,7 @@ import sys
|
|
7
7
|
|
8
8
|
import pandas as pd
|
9
9
|
|
10
|
-
from validmind import tags, tasks
|
10
|
+
from validmind import RawData, tags, tasks
|
11
11
|
from validmind.errors import MissingDependencyError
|
12
12
|
from validmind.logging import get_logger
|
13
13
|
|
@@ -119,7 +119,7 @@ def ProtectedClassesDisparity(
|
|
119
119
|
mask_significance=True,
|
120
120
|
)
|
121
121
|
|
122
|
-
|
122
|
+
returns = [] # Renamed to 'returns' for clarity
|
123
123
|
for protected_class in protected_classes:
|
124
124
|
plot = ap.disparity(
|
125
125
|
bdf, metrics, protected_class, fairness_threshold=disparity_tolerance
|
@@ -129,12 +129,16 @@ def ProtectedClassesDisparity(
|
|
129
129
|
plot.save(
|
130
130
|
buf, format="png"
|
131
131
|
) # as long as the above library is installed, this will work
|
132
|
-
|
132
|
+
returns.append(buf.getvalue())
|
133
133
|
|
134
134
|
string = "_disparity"
|
135
135
|
metrics_adj = [x + string for x in metrics]
|
136
136
|
|
137
137
|
table = bdf[["attribute_name", "attribute_value"] + b.list_disparities(bdf)]
|
138
|
-
|
138
|
+
returns.append(aqp.plot_disparity_all(bdf, metrics=metrics_adj))
|
139
139
|
|
140
|
-
return (
|
140
|
+
return (
|
141
|
+
table,
|
142
|
+
*returns,
|
143
|
+
RawData(model=model.input_id, dataset=dataset.input_id, disparity_data=bdf),
|
144
|
+
)
|
@@ -8,7 +8,7 @@ import sys
|
|
8
8
|
import matplotlib.pyplot as plt
|
9
9
|
import pandas as pd
|
10
10
|
|
11
|
-
from validmind import tags, tasks
|
11
|
+
from validmind import RawData, tags, tasks
|
12
12
|
from validmind.errors import MissingDependencyError
|
13
13
|
from validmind.logging import get_logger
|
14
14
|
|
@@ -103,7 +103,15 @@ def ProtectedClassesThresholdOptimizer(
|
|
103
103
|
test_df, target, y_pred_opt, protected_classes
|
104
104
|
)
|
105
105
|
|
106
|
-
return
|
106
|
+
return (
|
107
|
+
{"DPR and EOR Table": fairness_metrics.reset_index()},
|
108
|
+
fig,
|
109
|
+
RawData(
|
110
|
+
y_predictions=y_pred_opt.tolist(),
|
111
|
+
dataset=dataset.input_id,
|
112
|
+
protected_classes=protected_classes,
|
113
|
+
),
|
114
|
+
)
|
107
115
|
|
108
116
|
|
109
117
|
def initialize_and_fit_optimizer(pipeline, X_train, y_train, protected_classes_df):
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import numpy as np
|
6
6
|
import pandas as pd
|
7
7
|
|
8
|
-
from validmind import tags, tasks
|
8
|
+
from validmind import RawData, tags, tasks
|
9
9
|
from validmind.vm_models import VMDataset, VMModel
|
10
10
|
|
11
11
|
|
@@ -137,4 +137,6 @@ def ScoreBandDefaultRates(
|
|
137
137
|
}
|
138
138
|
)
|
139
139
|
|
140
|
-
return pd.DataFrame(results)
|
140
|
+
return pd.DataFrame(results), RawData(
|
141
|
+
results=results, model=model.input_id, dataset=dataset.input_id
|
142
|
+
)
|
@@ -166,4 +166,4 @@ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
|
|
166
166
|
if not figures:
|
167
167
|
raise SkipTestError("No valid features found for seasonal decomposition")
|
168
168
|
|
169
|
-
return (*figures, RawData(decomposed_components=raw_data))
|
169
|
+
return (*figures, RawData(decomposed_components=raw_data, dataset=dataset.input_id))
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import pandas as pd
|
6
6
|
from scipy import stats
|
7
7
|
|
8
|
-
from validmind import tags, tasks
|
8
|
+
from validmind import RawData, tags, tasks
|
9
9
|
|
10
10
|
|
11
11
|
@tasks("classification", "regression")
|
@@ -66,4 +66,4 @@ def ShapiroWilk(dataset):
|
|
66
66
|
sw_df.reset_index(inplace=True)
|
67
67
|
sw_df.columns = ["column", "stat", "pvalue"]
|
68
68
|
|
69
|
-
return sw_df
|
69
|
+
return sw_df, RawData(shapiro_results=sw_values, dataset=dataset.input_id)
|
@@ -2,10 +2,8 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
from ydata_profiling.config import Settings
|
6
|
-
from ydata_profiling.model.typeset import ProfilingTypeSet
|
7
|
-
|
8
5
|
from validmind import tags, tasks
|
6
|
+
from validmind.utils import infer_datatypes
|
9
7
|
|
10
8
|
|
11
9
|
@tags("data_quality", "tabular_data")
|
@@ -49,8 +47,11 @@ def Skewness(dataset, max_threshold=1):
|
|
49
47
|
- Subjective threshold for risk grading, requiring expert input and recurrent iterations for refinement.
|
50
48
|
"""
|
51
49
|
|
52
|
-
|
53
|
-
dataset_types =
|
50
|
+
# Use the imported infer_datatypes function
|
51
|
+
dataset_types = infer_datatypes(dataset.df)
|
52
|
+
|
53
|
+
# Convert the list of dictionaries to a dictionary for easy access
|
54
|
+
dataset_types_dict = {item["id"]: item["type"] for item in dataset_types}
|
54
55
|
|
55
56
|
skewness = dataset.df.skew(numeric_only=True)
|
56
57
|
|
@@ -58,7 +59,7 @@ def Skewness(dataset, max_threshold=1):
|
|
58
59
|
passed = True
|
59
60
|
|
60
61
|
for col in skewness.index:
|
61
|
-
if
|
62
|
+
if dataset_types_dict.get(col) != "Numeric":
|
62
63
|
continue
|
63
64
|
|
64
65
|
col_skewness = skewness[col]
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import pandas as pd
|
6
6
|
import plotly.graph_objects as go
|
7
7
|
|
8
|
-
from validmind import tags, tasks
|
8
|
+
from validmind import RawData, tags, tasks
|
9
9
|
from validmind.errors import SkipTestError
|
10
10
|
from validmind.vm_models import VMDataset
|
11
11
|
|
@@ -111,4 +111,9 @@ def TimeSeriesOutliers(dataset: VMDataset, zscore_threshold: int = 3):
|
|
111
111
|
|
112
112
|
figures.append(fig)
|
113
113
|
|
114
|
-
return (
|
114
|
+
return (
|
115
|
+
outlier_df.sort_values(["Column", "Date"]),
|
116
|
+
figures,
|
117
|
+
len(outlier_df) == 0,
|
118
|
+
RawData(outliers=outlier_df, dataset=dataset.input_id),
|
119
|
+
)
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
6
6
|
from arch.unitroot import ZivotAndrews
|
7
7
|
from numpy.linalg import LinAlgError
|
8
8
|
|
9
|
-
from validmind import tags, tasks
|
9
|
+
from validmind import RawData, tags, tasks
|
10
10
|
from validmind.errors import SkipTestError
|
11
11
|
from validmind.logging import get_logger
|
12
12
|
from validmind.vm_models import VMDataset
|
@@ -83,4 +83,7 @@ def ZivotAndrewsArch(dataset: VMDataset):
|
|
83
83
|
}
|
84
84
|
)
|
85
85
|
|
86
|
-
return
|
86
|
+
return (
|
87
|
+
{"Zivot-Andrews Test Results": za_values},
|
88
|
+
RawData(zivot_andrews=za_values, dataset=dataset.input_id),
|
89
|
+
)
|
@@ -144,4 +144,8 @@ def PolarityAndSubjectivity(dataset, threshold_subjectivity=0.5, threshold_polar
|
|
144
144
|
|
145
145
|
statistics_tables = {"Quadrant Distribution": quadrant_df, "Statistics": stats_df}
|
146
146
|
|
147
|
-
return
|
147
|
+
return (
|
148
|
+
fig,
|
149
|
+
statistics_tables,
|
150
|
+
RawData(sentiment_data=data, dataset=dataset.input_id),
|
151
|
+
)
|
@@ -65,7 +65,7 @@ def Punctuations(dataset, count_mode="token"):
|
|
65
65
|
punctuation_counts = _count_punctuations(corpus, count_mode)
|
66
66
|
fig = _create_punctuation_plot(punctuation_counts)
|
67
67
|
|
68
|
-
return fig, RawData(punctuation_counts=punctuation_counts)
|
68
|
+
return fig, RawData(punctuation_counts=punctuation_counts, dataset=dataset.input_id)
|
69
69
|
|
70
70
|
|
71
71
|
def _create_punctuation_plot(punctuation_counts):
|