validmind 2.7.6__py3-none-any.whl → 2.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +2 -0
- validmind/__version__.py +1 -1
- validmind/api_client.py +8 -1
- validmind/datasets/credit_risk/lending_club.py +3 -4
- validmind/html_templates/content_blocks.py +1 -1
- validmind/tests/__types__.py +17 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +6 -2
- validmind/tests/data_validation/AutoMA.py +2 -2
- validmind/tests/data_validation/BivariateScatterPlots.py +4 -2
- validmind/tests/data_validation/BoxPierce.py +2 -2
- validmind/tests/data_validation/ClassImbalance.py +2 -1
- validmind/tests/data_validation/DatasetDescription.py +11 -2
- validmind/tests/data_validation/DatasetSplit.py +2 -2
- validmind/tests/data_validation/DickeyFullerGLS.py +2 -2
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +8 -2
- validmind/tests/data_validation/HighCardinality.py +9 -2
- validmind/tests/data_validation/HighPearsonCorrelation.py +6 -2
- validmind/tests/data_validation/IQROutliersBarPlot.py +9 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +2 -2
- validmind/tests/data_validation/MissingValuesBarPlot.py +12 -9
- validmind/tests/data_validation/MutualInformation.py +6 -8
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +2 -2
- validmind/tests/data_validation/ProtectedClassesCombination.py +6 -1
- validmind/tests/data_validation/ProtectedClassesDescription.py +1 -1
- validmind/tests/data_validation/ProtectedClassesDisparity.py +4 -5
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +1 -4
- validmind/tests/data_validation/RollingStatsPlot.py +21 -10
- validmind/tests/data_validation/ScatterPlot.py +3 -5
- validmind/tests/data_validation/ScoreBandDefaultRates.py +2 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -2
- validmind/tests/data_validation/Skewness.py +6 -3
- validmind/tests/data_validation/SpreadPlot.py +8 -3
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +4 -2
- validmind/tests/data_validation/TabularDateTimeHistograms.py +2 -2
- validmind/tests/data_validation/TargetRateBarPlots.py +4 -3
- validmind/tests/data_validation/TimeSeriesFrequency.py +7 -2
- validmind/tests/data_validation/TimeSeriesMissingValues.py +14 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +1 -5
- validmind/tests/data_validation/WOEBinPlots.py +2 -2
- validmind/tests/data_validation/WOEBinTable.py +11 -9
- validmind/tests/data_validation/nlp/CommonWords.py +2 -2
- validmind/tests/data_validation/nlp/Hashtags.py +2 -2
- validmind/tests/data_validation/nlp/LanguageDetection.py +9 -6
- validmind/tests/data_validation/nlp/Mentions.py +9 -6
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -2
- validmind/tests/data_validation/nlp/Punctuations.py +4 -2
- validmind/tests/data_validation/nlp/Sentiment.py +2 -2
- validmind/tests/data_validation/nlp/StopWords.py +5 -4
- validmind/tests/data_validation/nlp/TextDescription.py +2 -2
- validmind/tests/data_validation/nlp/Toxicity.py +2 -2
- validmind/tests/model_validation/BertScore.py +2 -2
- validmind/tests/model_validation/BleuScore.py +2 -2
- validmind/tests/model_validation/ClusterSizeDistribution.py +2 -2
- validmind/tests/model_validation/ContextualRecall.py +2 -2
- validmind/tests/model_validation/FeaturesAUC.py +2 -2
- validmind/tests/model_validation/MeteorScore.py +2 -2
- validmind/tests/model_validation/ModelPredictionResiduals.py +2 -2
- validmind/tests/model_validation/RegardScore.py +6 -2
- validmind/tests/model_validation/RegressionResidualsPlot.py +4 -3
- validmind/tests/model_validation/RougeScore.py +6 -5
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +11 -2
- validmind/tests/model_validation/TokenDisparity.py +2 -2
- validmind/tests/model_validation/ToxicityScore.py +10 -2
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +9 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +16 -2
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +2 -2
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +14 -4
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +2 -2
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +16 -2
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +2 -2
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -5
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +4 -2
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +4 -2
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +4 -2
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +4 -2
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +8 -6
- validmind/tests/model_validation/embeddings/utils.py +11 -1
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +2 -1
- validmind/tests/model_validation/ragas/AspectCritic.py +11 -7
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +2 -1
- validmind/tests/model_validation/ragas/ContextPrecision.py +2 -1
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +2 -1
- validmind/tests/model_validation/ragas/ContextRecall.py +2 -1
- validmind/tests/model_validation/ragas/Faithfulness.py +2 -1
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +2 -1
- validmind/tests/model_validation/ragas/ResponseRelevancy.py +2 -1
- validmind/tests/model_validation/ragas/SemanticSimilarity.py +2 -1
- validmind/tests/model_validation/sklearn/CalibrationCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +2 -5
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -2
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +2 -2
- validmind/tests/model_validation/sklearn/FeatureImportance.py +1 -14
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +6 -3
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +2 -2
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +8 -4
- validmind/tests/model_validation/sklearn/ModelParameters.py +1 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +2 -2
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +20 -16
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +4 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +7 -9
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +1 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +2 -1
- validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +2 -1
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -3
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +9 -1
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +1 -1
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +11 -4
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -3
- validmind/tests/model_validation/statsmodels/GINITable.py +7 -15
- validmind/tests/model_validation/statsmodels/Lilliefors.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +5 -2
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +5 -2
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +7 -7
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +2 -2
- validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +3 -1
- validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +4 -2
- validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py +4 -2
- validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +3 -1
- validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +3 -1
- validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +3 -1
- validmind/tests/ongoing_monitoring/FeatureDrift.py +1 -0
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +1 -0
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +3 -1
- validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py +1 -0
- validmind/tests/ongoing_monitoring/ROCCurveDrift.py +3 -2
- validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +4 -2
- validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +3 -1
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -3
- validmind/tests/prompt_validation/Bias.py +13 -9
- validmind/tests/prompt_validation/Clarity.py +13 -9
- validmind/tests/prompt_validation/Conciseness.py +13 -9
- validmind/tests/prompt_validation/Delimitation.py +13 -9
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +6 -2
- validmind/tests/prompt_validation/Specificity.py +13 -9
- validmind/tests/run.py +6 -0
- validmind/utils.py +7 -8
- {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/METADATA +1 -2
- {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/RECORD +147 -147
- {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/WHEEL +1 -1
- {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/LICENSE +0 -0
- {validmind-2.7.6.dist-info → validmind-2.7.7.dist-info}/entry_points.txt +0 -0
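The hunks below (covering the data_validation and model_validation test modules) follow one recurring pattern: each test now imports RawData from validmind and returns a RawData object alongside its figures and tables, so the intermediate values behind each plot are kept with the result. A minimal sketch of that pattern, assuming a registered VMDataset; the test name, column handling, and RawData keys here are illustrative only, not code from the package:

# Hedged sketch of the recurring 2.7.7 change: tests return RawData
# alongside their figures. Names below are illustrative assumptions.
import plotly.express as px

from validmind import RawData, tags, tasks
from validmind.vm_models import VMDataset


@tags("tabular_data", "visualization")
@tasks("classification")
def ExampleTargetDistribution(dataset: VMDataset):
    # Compute the values that back the figure...
    counts = dataset.df[dataset.target_column].value_counts()
    fig = px.bar(counts, title="Target distribution")

    # ...and return them as RawData so they are preserved with the result,
    # instead of returning the figure alone as in 2.7.6.
    return fig, RawData(target_counts=counts)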
validmind/tests/data_validation/SeasonalDecompose.py

@@ -9,7 +9,7 @@ from plotly.subplots import make_subplots
 from scipy import stats
 from statsmodels.tsa.seasonal import seasonal_decompose
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.logging import get_logger
 from validmind.vm_models import VMDataset
@@ -65,6 +65,8 @@ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
 
     figures = []
 
+    raw_data = {}
+
     for col in df.columns:
         series = df[col].dropna()
 
@@ -153,7 +155,15 @@ def SeasonalDecompose(dataset: VMDataset, seasonal_model: str = "additive"):
 
         figures.append(fig)
 
+        # Add the decomposed components to raw_data
+        raw_data[col] = {
+            "observed": sd.observed,
+            "trend": sd.trend,
+            "seasonal": sd.seasonal,
+            "residuals": sd.resid,
+        }
+
     if not figures:
         raise SkipTestError("No valid features found for seasonal decomposition")
 
-    return
+    return (*figures, RawData(decomposed_components=raw_data))
validmind/tests/data_validation/SpreadPlot.py

@@ -6,7 +6,7 @@ import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -70,6 +70,7 @@ def SpreadPlot(dataset: VMDataset):
     ]
 
     figures = []
+    spread_data = {}
 
     for var1, var2 in feature_pairs:
         fig, ax = plt.subplots()
@@ -80,8 +81,9 @@ def SpreadPlot(dataset: VMDataset):
             y=0.95,
         )
 
+        spread_series = df[var1] - df[var2]
         sns.lineplot(
-            data=
+            data=spread_series,
             ax=ax,
         )
 
@@ -89,5 +91,8 @@ def SpreadPlot(dataset: VMDataset):
         ax.tick_params(axis="both", labelsize=18)
 
         figures.append(fig)
+        spread_data[f"{var1}_{var2}_spread"] = spread_series.to_frame(
+            name=f"spread_{var1}_{var2}"
+        )
 
-    return
+    return (*figures, RawData(spread_data=spread_data))
validmind/tests/data_validation/TabularCategoricalBarPlots.py

@@ -4,7 +4,7 @@
 
 import plotly.graph_objs as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -66,9 +66,11 @@ def TabularCategoricalBarPlots(dataset: VMDataset):
     ]
 
     figures = []
+    counts_dict = {}
 
     for col in dataset.feature_columns_categorical:
         counts = dataset.df[col].value_counts()
+        counts_dict[col] = counts
 
         fig = go.Figure()
         fig.add_trace(
@@ -90,4 +92,4 @@
         )
         figures.append(fig)
 
-    return
+    return (*figures, RawData(category_counts=counts_dict))
validmind/tests/data_validation/TabularDateTimeHistograms.py

@@ -5,7 +5,7 @@
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -72,4 +72,4 @@ def TabularDateTimeHistograms(dataset: VMDataset):
         font=dict(size=18),
     )
 
-    return fig
+    return fig, RawData(date_differences=date_diffs)
validmind/tests/data_validation/TargetRateBarPlots.py

@@ -6,7 +6,7 @@ import numpy as np
 import plotly.graph_objs as go
 from plotly.subplots import make_subplots
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -62,12 +62,13 @@ def TargetRateBarPlots(dataset: VMDataset):
 
     df = dataset.df
     figures = []
+    raw_data = []
 
     for col in dataset.feature_columns_categorical:
-
         # Calculate counts and default rate for each category
         counts = df[col].value_counts()
         default_rate = df.groupby(col)[dataset.target_column].mean()
+        raw_data.append({"column": col, "counts": counts, "default_rate": default_rate})
 
         fig = make_subplots(
             rows=1,
@@ -107,4 +108,4 @@
 
         figures.append(fig)
 
-    return
+    return (*figures, RawData(target_rates_by_category=raw_data))
validmind/tests/data_validation/TimeSeriesFrequency.py

@@ -5,7 +5,7 @@
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -103,4 +103,9 @@ def TimeSeriesFrequency(dataset: VMDataset):
         ),
     )
 
-    return
+    return (
+        frequencies,
+        fig,
+        len(set(item["Frequency"] for item in frequencies)) == 1,
+        RawData(time_diff_days=time_diff_days),
+    )
validmind/tests/data_validation/TimeSeriesMissingValues.py

@@ -6,7 +6,7 @@ import pandas as pd
 import plotly.express as px
 import plotly.figure_factory as ff
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -62,15 +62,18 @@ def TimeSeriesMissingValues(dataset: VMDataset, min_threshold: int = 1):
 
     if sum(missing.values) == 0:
         # if theres no missing values, no need to plot anything
-        return
-
-
-
-
-
-
-
-
+        return (
+            [
+                {
+                    "Column": col,
+                    "Number of Missing Values": missing[col],
+                    "Percentage of Missing Values (%)": 0,
+                    "Pass/Fail": "Pass",
+                }
+                for col in missing.index
+            ],
+            True,
+        )
 
     barplot = px.bar(
         missing,
@@ -110,4 +113,5 @@ def TimeSeriesMissingValues(dataset: VMDataset, min_threshold: int = 1):
         barplot,
         heatmap,
         all(missing[col] < min_threshold for col in missing.index),
+        RawData(missing_values_count=missing, missing_values_mask=missing_mask),
     )
validmind/tests/data_validation/TimeSeriesOutliers.py

@@ -111,8 +111,4 @@ def TimeSeriesOutliers(dataset: VMDataset, zscore_threshold: int = 3):
 
         figures.append(fig)
 
-    return (
-        outlier_df.sort_values(["Column", "Date"]),
-        figures,
-        len(outlier_df) == 0,
-    )
+    return (outlier_df.sort_values(["Column", "Date"]), figures, len(outlier_df) == 0)
validmind/tests/data_validation/WOEBinPlots.py

@@ -9,7 +9,7 @@ import plotly.graph_objects as go
 import scorecardpy as sc
 from plotly.subplots import make_subplots
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.logging import get_logger
 from validmind.vm_models import VMDataset
@@ -140,4 +140,4 @@ def WOEBinPlots(
 
         figures.append(fig)
 
-    return
+    return (*figures, RawData(woe_iv_data=woe_iv_df))
validmind/tests/data_validation/WOEBinTable.py

@@ -5,7 +5,7 @@
 import pandas as pd
 import scorecardpy as sc
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -61,12 +61,14 @@ def WOEBinTable(dataset: VMDataset, breaks_adj: list = None):
     except Exception as e:
         raise SkipTestError(f"Error during binning: {e}")
 
+    result_table = (
+        pd.concat(bins.values(), keys=bins.keys())
+        .reset_index()
+        .drop(columns=["variable"])
+        .rename(columns={"level_0": "variable"})
+        .assign(bin_number=lambda x: x.groupby("variable").cumcount())
+    )
+
     return {
-        "Weight of Evidence (WoE) and Information Value (IV)":
-
-        .reset_index()
-        .drop(columns=["variable"])
-        .rename(columns={"level_0": "variable"})
-        .assign(bin_number=lambda x: x.groupby("variable").cumcount())
-        )
-    }
+        "Weight of Evidence (WoE) and Information Value (IV)": result_table
+    }, RawData(woe_bins=bins)
validmind/tests/data_validation/nlp/CommonWords.py

@@ -8,7 +8,7 @@ import nltk
 import plotly.graph_objects as go
 from nltk.corpus import stopwords
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset
 
 
@@ -94,4 +94,4 @@ def CommonWords(dataset: VMDataset):
         xaxis_tickangle=-45,
     )
 
-    return fig
+    return fig, RawData(words=x, frequencies=y)
validmind/tests/data_validation/nlp/Hashtags.py

@@ -6,7 +6,7 @@ import re
 
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -76,4 +76,4 @@ def Hashtags(dataset: VMDataset, top_hashtags: int = 25):
         xaxis_tickangle=-45,
     )
 
-    return fig
+    return fig, RawData(top_hashtag_counts=top_hashtag_counts)
validmind/tests/data_validation/nlp/LanguageDetection.py

@@ -5,7 +5,7 @@
 import plotly.express as px
 from langdetect import LangDetectException, detect
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("nlp", "text_data", "visualization")
@@ -64,9 +64,12 @@ def LanguageDetection(dataset):
 
     languages = dataset.df[dataset.text_column].apply(detect_language)
 
-    return
-
-
-
-
+    return (
+        px.histogram(
+            languages,
+            x=languages,
+            title="Language Distribution",
+            labels={"x": "Language Codes"},
+        ),
+        RawData(detected_languages=languages),
     )
validmind/tests/data_validation/nlp/Mentions.py

@@ -7,7 +7,7 @@ import re
 import pandas as pd
 import plotly.express as px
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.vm_models import VMDataset
 
@@ -75,9 +75,12 @@ def Mentions(dataset: VMDataset, top_mentions: int = 25):
         }
     )
 
-    return
-
-
-
-
+    return (
+        px.treemap(
+            mention_frequencies_df,
+            path=["Scenario"],
+            values="Percentage",
+            title="Tree of Mentions",
+        ),
+        RawData(mention_counts=mention_counts),
     )
validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py

@@ -7,7 +7,7 @@ import pandas as pd
 import plotly.express as px
 from textblob import TextBlob
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("nlp", "text_data", "data_validation")
@@ -144,4 +144,4 @@ def PolarityAndSubjectivity(dataset, threshold_subjectivity=0.5, threshold_polar
 
     statistics_tables = {"Quadrant Distribution": quadrant_df, "Statistics": stats_df}
 
-    return fig, statistics_tables
+    return fig, statistics_tables, RawData(sentiment_data=data)
validmind/tests/data_validation/nlp/Punctuations.py

@@ -11,7 +11,7 @@ from collections import defaultdict
 
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("nlp", "text_data", "visualization", "frequency_analysis")
@@ -63,7 +63,9 @@ def Punctuations(dataset, count_mode="token"):
 
     corpus = _create_corpus(dataset.df, dataset.text_column)
     punctuation_counts = _count_punctuations(corpus, count_mode)
-
+    fig = _create_punctuation_plot(punctuation_counts)
+
+    return fig, RawData(punctuation_counts=punctuation_counts)
 
 
 def _create_punctuation_plot(punctuation_counts):
validmind/tests/data_validation/nlp/Sentiment.py

@@ -8,7 +8,7 @@ import nltk
 import seaborn as sns
 from nltk.sentiment import SentimentIntensityAnalyzer
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("nlp", "text_data", "data_validation")
@@ -77,4 +77,4 @@ def Sentiment(dataset):
 
     plt.close("all")
 
-    return fig
+    return fig, RawData(sentiment_scores=vader_sentiment.tolist())
validmind/tests/data_validation/nlp/StopWords.py

@@ -13,7 +13,7 @@ import pandas as pd
 import plotly.graph_objects as go
 from nltk.corpus import stopwords
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset
 
 
@@ -84,17 +84,17 @@ def StopWords(
     nltk.download("stopwords", quiet=True)
 
     stop = set(stopwords.words("english"))
-
+    stop_word_frequencies = defaultdict(int)
     for word in corpus:
         if word in stop:
-
+            stop_word_frequencies[word] += 1
 
     # Calculate the total number of words in the corpus
     total_words = len(corpus)
 
     # Calculate the percentage of each word in the corpus
     word_percentages = {}
-    for word, count in
+    for word, count in stop_word_frequencies.items():
         percentage = (count / total_words) * 100
         word_percentages[word] = percentage
 
@@ -124,4 +124,5 @@ def StopWords(
         },
         fig,
         passed,
+        RawData(stop_word_frequencies=stop_word_frequencies, total_words=total_words),
     )
validmind/tests/data_validation/nlp/TextDescription.py

@@ -9,7 +9,7 @@ import pandas as pd
 import plotly.express as px
 from nltk.corpus import stopwords
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset
 
 
@@ -173,4 +173,4 @@ def TextDescription(
         )
     )
 
-    return
+    return (*figures, RawData(metrics_dataframe=metrics_df))
validmind/tests/data_validation/nlp/Toxicity.py

@@ -6,7 +6,7 @@ import evaluate
 import matplotlib.pyplot as plt
 import seaborn as sns
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("nlp", "text_data", "data_validation")
@@ -73,4 +73,4 @@ def Toxicity(dataset):
 
     plt.close()
 
-    return fig
+    return fig, RawData(toxicity_scores=toxicity_scores)
validmind/tests/model_validation/BertScore.py

@@ -6,7 +6,7 @@ import evaluate
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.tests.utils import validate_prediction
 
 
@@ -131,4 +131,4 @@ def BertScore(
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (result_df, *
+    return (result_df, *figures, RawData(bert_scores_df=metrics_df))
validmind/tests/model_validation/BleuScore.py

@@ -6,7 +6,7 @@ import evaluate
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.tests.utils import validate_prediction
 
 
@@ -114,4 +114,4 @@ def BleuScore(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (result_df, *
+    return (result_df, *figures, RawData(bleu_scores_df=metrics_df))
validmind/tests/model_validation/ClusterSizeDistribution.py

@@ -5,7 +5,7 @@
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
 
 
@@ -72,4 +72,4 @@ def ClusterSizeDistribution(dataset: VMDataset, model: VMModel):
     fig.update_yaxes(title_text="Counts", showgrid=False)
     fig.update_layout(title_text="Cluster distribution", title_x=0.5, barmode="group")
 
-    return fig
+    return fig, RawData(cluster_counts=df_counts)
validmind/tests/model_validation/ContextualRecall.py

@@ -6,7 +6,7 @@ import nltk
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.tests.utils import validate_prediction
 
 
@@ -118,4 +118,4 @@ def ContextualRecall(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (result_df, *tuple(figures))
+    return (result_df, *tuple(figures), RawData(contextual_recall_scores=metrics_df))
validmind/tests/model_validation/FeaturesAUC.py

@@ -7,7 +7,7 @@ import pandas as pd
 import plotly.graph_objects as go
 from sklearn.metrics import roc_auc_score
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.errors import SkipTestError
 from validmind.logging import get_logger
 from validmind.vm_models import VMDataset
@@ -95,4 +95,4 @@ def FeaturesAUC(dataset: VMDataset, fontsize: int = 12, figure_height: int = 500
         height=figure_height,
     )
 
-    return fig
+    return fig, RawData(feature_aucs=aucs)
validmind/tests/model_validation/MeteorScore.py

@@ -6,7 +6,7 @@ import evaluate
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.tests.utils import validate_prediction
 
 
@@ -117,4 +117,4 @@ def MeteorScore(dataset, model):
     # Create a DataFrame from all collected statistics
     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
 
-    return (result_df, *tuple(figures))
+    return (result_df, *tuple(figures), RawData(meteor_scores=metrics_df))
validmind/tests/model_validation/ModelPredictionResiduals.py

@@ -6,7 +6,7 @@ import pandas as pd
 import plotly.graph_objects as go
 from scipy.stats import kstest
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("regression")
@@ -102,4 +102,4 @@ def ModelPredictionResiduals(
     # Create a summary DataFrame for the KS normality test results
    summary_df = pd.DataFrame([summary])
 
-    return (summary_df, *figures)
+    return (summary_df, *figures, RawData(residuals=residuals))
validmind/tests/model_validation/RegardScore.py

@@ -6,7 +6,7 @@ import evaluate
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.tests.utils import validate_prediction
 
 
@@ -142,4 +142,8 @@ def RegardScore(dataset, model):
         ]
     ]
 
-    return (
+    return (
+        result_df,
+        *figures,
+        RawData(true_regard=true_df, pred_regard=pred_df),
+    )
validmind/tests/model_validation/RegressionResidualsPlot.py

@@ -6,7 +6,7 @@ import numpy as np
 import plotly.figure_factory as ff
 import plotly.graph_objects as go
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
 
 
@@ -60,8 +60,9 @@ def RegressionResidualsPlot(model: VMModel, dataset: VMDataset, bin_size: float
     figures = []
 
     # Residuals plot
+    residuals = y_true.flatten() - y_pred.flatten()
     fig = ff.create_distplot(
-        hist_data=[
+        hist_data=[residuals],
         group_labels=["Residuals"],
         bin_size=[bin_size],
         show_hist=True,
@@ -104,4 +105,4 @@ def RegressionResidualsPlot(model: VMModel, dataset: VMDataset, bin_size: float
         )
     )
 
-    return
+    return (*figures, RawData(residuals=residuals, y_true=y_true, y_pred=y_pred))
validmind/tests/model_validation/RougeScore.py

@@ -6,7 +6,7 @@ import pandas as pd
 import plotly.graph_objects as go
 from rouge import Rouge
 
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 
 
 @tags("nlp", "text_data", "visualization")
@@ -118,7 +118,8 @@ def RougeScore(dataset, model, metric="rouge-1"):
         {"p": "Precision", "r": "Recall", "f": "F1 Score"}
     )
 
-
-
-
-
+    return (
+        pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"}),
+        *figures,
+        RawData(rouge_scores_df=df_scores),
+    )