validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
@@ -2,22 +2,19 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
"""
|
6
|
-
Metrics functions for any Pandas-compatible datasets
|
7
|
-
"""
|
8
|
-
|
9
5
|
from collections import Counter
|
10
|
-
from dataclasses import dataclass
|
11
6
|
|
12
|
-
import matplotlib.pyplot as plt
|
13
7
|
import nltk
|
8
|
+
import plotly.graph_objects as go
|
14
9
|
from nltk.corpus import stopwords
|
15
10
|
|
16
|
-
from
|
11
|
+
from validmind import tags, tasks
|
12
|
+
from validmind.vm_models import VMDataset
|
17
13
|
|
18
14
|
|
19
|
-
@
|
20
|
-
|
15
|
+
@tags("nlp", "text_data", "visualization", "frequency_analysis")
|
16
|
+
@tasks("text_classification", "text_summarization")
|
17
|
+
def CommonWords(dataset: VMDataset):
|
21
18
|
"""
|
22
19
|
Assesses the most frequent non-stopwords in a text column for identifying prevalent language patterns.
|
23
20
|
|
@@ -31,8 +28,8 @@ class CommonWords(Metric):
|
|
31
28
|
|
32
29
|
The test methodology involves splitting the specified text column's entries into words, collating them into a
|
33
30
|
corpus, and then counting the frequency of each word using the Counter. The forty most frequently occurring
|
34
|
-
non-stopwords are then visualized in
|
35
|
-
their frequency of occurrence.
|
31
|
+
non-stopwords are then visualized in an interactive bar chart using Plotly, where the x-axis represents the words,
|
32
|
+
and the y-axis indicates their frequency of occurrence.
|
36
33
|
|
37
34
|
### Signs of High Risk
|
38
35
|
|
@@ -46,7 +43,7 @@ class CommonWords(Metric):
|
|
46
43
|
- The metric provides clear insights into the language features – specifically word frequency – of unstructured
|
47
44
|
text data.
|
48
45
|
- It can reveal prominent vocabulary and language patterns, which prove vital for feature extraction in NLP tasks.
|
49
|
-
- The visualization helps in quickly capturing the patterns and understanding the data intuitively.
|
46
|
+
- The interactive visualization helps in quickly capturing the patterns and understanding the data intuitively.
|
50
47
|
|
51
48
|
### Limitations
|
52
49
|
|
@@ -58,48 +55,43 @@ class CommonWords(Metric):
|
|
58
55
|
applicability.
|
59
56
|
"""
|
60
57
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
key=self.key,
|
102
|
-
figure=fig,
|
103
|
-
)
|
104
|
-
]
|
105
|
-
)
|
58
|
+
# Check text column
|
59
|
+
if not dataset.text_column:
|
60
|
+
raise ValueError("Please set text_column name in the Validmind Dataset object")
|
61
|
+
|
62
|
+
nltk.download("stopwords", quiet=True)
|
63
|
+
|
64
|
+
counter = Counter(
|
65
|
+
[word for x in dataset.df[dataset.text_column].str.split() for word in x]
|
66
|
+
)
|
67
|
+
most = counter.most_common()
|
68
|
+
|
69
|
+
def create_corpus(df, text_column):
|
70
|
+
corpus = []
|
71
|
+
for x in df[text_column].str.split():
|
72
|
+
for i in x:
|
73
|
+
corpus.append(i)
|
74
|
+
return corpus
|
75
|
+
|
76
|
+
corpus = create_corpus(dataset.df, text_column=dataset.text_column)
|
77
|
+
counter = Counter(corpus)
|
78
|
+
most = counter.most_common()
|
79
|
+
|
80
|
+
x = []
|
81
|
+
y = []
|
82
|
+
|
83
|
+
stop = set(stopwords.words("english"))
|
84
|
+
for word, count in most[:40]:
|
85
|
+
if word not in stop:
|
86
|
+
x.append(word)
|
87
|
+
y.append(count)
|
88
|
+
|
89
|
+
fig = go.Figure(data=[go.Bar(x=x, y=y, marker_color="#17C37B")])
|
90
|
+
fig.update_layout(
|
91
|
+
title="Most Common Words",
|
92
|
+
xaxis_title="Words",
|
93
|
+
yaxis_title="Frequency",
|
94
|
+
xaxis_tickangle=-45,
|
95
|
+
)
|
96
|
+
|
97
|
+
return fig
|
@@ -2,20 +2,18 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
"""
|
6
|
-
Threshold based tests
|
7
|
-
"""
|
8
|
-
|
9
5
|
import re
|
10
|
-
from dataclasses import dataclass
|
11
6
|
|
12
7
|
import plotly.graph_objects as go
|
13
8
|
|
14
|
-
from validmind
|
9
|
+
from validmind import tags, tasks
|
10
|
+
from validmind.errors import SkipTestError
|
11
|
+
from validmind.vm_models import VMDataset
|
15
12
|
|
16
13
|
|
17
|
-
@
|
18
|
-
|
14
|
+
@tags("nlp", "text_data", "visualization", "frequency_analysis")
|
15
|
+
@tasks("text_classification", "text_summarization")
|
16
|
+
def Hashtags(dataset: VMDataset, top_hashtags: int = 25):
|
19
17
|
"""
|
20
18
|
Assesses hashtag frequency in a text column, highlighting usage trends and potential dataset bias or spam.
|
21
19
|
|
@@ -58,44 +56,24 @@ class Hashtags(ThresholdTest):
|
|
58
56
|
- Does not provide context or sentiment associated with the hashtags, so the information provided may have limited
|
59
57
|
utility on its own.
|
60
58
|
"""
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
print(f"temp: {temp}")
|
83
|
-
|
84
|
-
figures = []
|
85
|
-
if not temp.empty:
|
86
|
-
fig = go.Figure(data=[go.Bar(x=temp.index, y=temp.values)])
|
87
|
-
fig.update_layout(
|
88
|
-
title="Top Hashtags",
|
89
|
-
xaxis_title="Hashtag",
|
90
|
-
yaxis_title="Count",
|
91
|
-
xaxis_tickangle=-45,
|
92
|
-
)
|
93
|
-
figures.append(
|
94
|
-
Figure(
|
95
|
-
for_object=self,
|
96
|
-
key=self.name,
|
97
|
-
figure=fig,
|
98
|
-
)
|
99
|
-
)
|
100
|
-
|
101
|
-
return self.cache_results([], passed=True, figures=figures)
|
59
|
+
hashtags = (
|
60
|
+
dataset.df[dataset.text_column]
|
61
|
+
.apply(lambda x: re.findall(r"(?<=#)\w+", str(x)))
|
62
|
+
.explode()
|
63
|
+
)
|
64
|
+
top_hashtag_counts = hashtags.value_counts().head(top_hashtags)
|
65
|
+
|
66
|
+
if top_hashtag_counts.empty:
|
67
|
+
raise SkipTestError("No hashtags found in the dataset")
|
68
|
+
|
69
|
+
fig = go.Figure(
|
70
|
+
data=[go.Bar(x=top_hashtag_counts.index, y=top_hashtag_counts.values)]
|
71
|
+
)
|
72
|
+
fig.update_layout(
|
73
|
+
title="Top Hashtags",
|
74
|
+
xaxis_title="Hashtag",
|
75
|
+
yaxis_title="Count",
|
76
|
+
xaxis_tickangle=-45,
|
77
|
+
)
|
78
|
+
|
79
|
+
return fig
|
@@ -2,11 +2,6 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
"""
|
6
|
-
Metrics functions for any Pandas-compatible datasets
|
7
|
-
"""
|
8
|
-
|
9
|
-
|
10
5
|
import plotly.express as px
|
11
6
|
from langdetect import LangDetectException, detect
|
12
7
|
|
@@ -55,24 +50,23 @@ def LanguageDetection(dataset):
|
|
55
50
|
- The test returns "Unknown" for entries where language detection fails, which might mask underlying issues with
|
56
51
|
certain languages or text formats.
|
57
52
|
"""
|
58
|
-
# check text column
|
59
53
|
if not dataset.text_column:
|
60
|
-
raise ValueError(
|
54
|
+
raise ValueError(
|
55
|
+
"Please set the `text_column` option when "
|
56
|
+
"initializing your Dataset object to use this test"
|
57
|
+
)
|
61
58
|
|
62
|
-
# Function to detect language
|
63
59
|
def detect_language(text):
|
64
60
|
try:
|
65
61
|
return detect(text)
|
66
62
|
except LangDetectException:
|
67
|
-
return "Unknown"
|
63
|
+
return "Unknown"
|
68
64
|
|
69
|
-
# Applying the language detection function to each text entry
|
70
65
|
languages = dataset.df[dataset.text_column].apply(detect_language)
|
71
|
-
|
66
|
+
|
67
|
+
return px.histogram(
|
72
68
|
languages,
|
73
69
|
x=languages,
|
74
70
|
title="Language Distribution",
|
75
71
|
labels={"x": "Language Codes"},
|
76
72
|
)
|
77
|
-
|
78
|
-
return fig
|
@@ -2,21 +2,19 @@
|
|
2
2
|
# See the LICENSE file in the root of this repository for details.
|
3
3
|
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
4
|
|
5
|
-
"""
|
6
|
-
Threshold based tests
|
7
|
-
"""
|
8
5
|
import re
|
9
|
-
from dataclasses import dataclass
|
10
6
|
|
11
|
-
import matplotlib.pyplot as plt
|
12
7
|
import pandas as pd
|
13
8
|
import plotly.express as px
|
14
9
|
|
15
|
-
from validmind
|
10
|
+
from validmind import tags, tasks
|
11
|
+
from validmind.errors import SkipTestError
|
12
|
+
from validmind.vm_models import VMDataset
|
16
13
|
|
17
14
|
|
18
|
-
@
|
19
|
-
|
15
|
+
@tags("nlp", "text_data", "visualization", "frequency_analysis")
|
16
|
+
@tasks("text_classification", "text_summarization")
|
17
|
+
def Mentions(dataset: VMDataset, top_mentions: int = 25):
|
20
18
|
"""
|
21
19
|
Calculates and visualizes frequencies of '@' prefixed mentions in a text-based dataset for NLP model analysis.
|
22
20
|
|
@@ -57,58 +55,29 @@ class Mentions(ThresholdTest):
|
|
57
55
|
- It does not provide insights on less frequently occurring data or outliers, which means potentially significant
|
58
56
|
patterns could be overlooked.
|
59
57
|
"""
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
self.inputs.dataset.df[text_column]
|
87
|
-
.apply(lambda x: mentions(x))
|
88
|
-
.value_counts()[:][1 : self.params["top_mentions"]]
|
89
|
-
.tolist()
|
90
|
-
)
|
91
|
-
row = pd.DataFrame({"scenario": []})
|
92
|
-
row["scenario"] = b
|
93
|
-
row["Percentage"] = a
|
94
|
-
figures = []
|
95
|
-
if not row.empty:
|
96
|
-
fig = px.treemap(
|
97
|
-
row, path=["scenario"], values="Percentage", title="Tree of Mentions"
|
98
|
-
)
|
99
|
-
figures.append(
|
100
|
-
Figure(
|
101
|
-
for_object=self,
|
102
|
-
key=self.name,
|
103
|
-
figure=fig,
|
104
|
-
)
|
105
|
-
)
|
106
|
-
|
107
|
-
# Do this if you want to prevent the figure from being displayed
|
108
|
-
plt.close("all")
|
109
|
-
|
110
|
-
return self.cache_results(
|
111
|
-
[],
|
112
|
-
passed=True,
|
113
|
-
figures=figures,
|
114
|
-
)
|
58
|
+
mention_counts = (
|
59
|
+
dataset.df[dataset.text_column]
|
60
|
+
.apply(lambda x: " ".join(re.findall(r"(?<=@)\w+", x)))
|
61
|
+
.value_counts()
|
62
|
+
)
|
63
|
+
|
64
|
+
if mention_counts.empty:
|
65
|
+
raise SkipTestError("No mentions found in the dataset")
|
66
|
+
|
67
|
+
start_index = 1 if mention_counts.iloc[0] == "" else 0
|
68
|
+
end_index = top_mentions + start_index
|
69
|
+
mention_counts = mention_counts[start_index:end_index]
|
70
|
+
|
71
|
+
mention_frequencies_df = pd.DataFrame(
|
72
|
+
{
|
73
|
+
"Scenario": mention_counts.index.tolist(),
|
74
|
+
"Percentage": mention_counts.tolist(),
|
75
|
+
}
|
76
|
+
)
|
77
|
+
|
78
|
+
return px.treemap(
|
79
|
+
mention_frequencies_df,
|
80
|
+
path=["Scenario"],
|
81
|
+
values="Percentage",
|
82
|
+
title="Tree of Mentions",
|
83
|
+
)
|
@@ -12,7 +12,7 @@ from validmind import tags, tasks
|
|
12
12
|
|
13
13
|
@tags("nlp", "text_data", "data_validation")
|
14
14
|
@tasks("nlp")
|
15
|
-
def PolarityAndSubjectivity(dataset):
|
15
|
+
def PolarityAndSubjectivity(dataset, threshold_subjectivity=0.5, threshold_polarity=0):
|
16
16
|
"""
|
17
17
|
Analyzes the polarity and subjectivity of text data within a given dataset to visualize the sentiment distribution.
|
18
18
|
|
@@ -50,23 +50,98 @@ def PolarityAndSubjectivity(dataset):
|
|
50
50
|
- Reliance on TextBlob which may not be accurate for all domains or contexts.
|
51
51
|
- Visualization could become cluttered with very large datasets, making interpretation difficult.
|
52
52
|
"""
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
53
|
+
# Check text column
|
54
|
+
if not dataset.text_column:
|
55
|
+
raise ValueError("Please set text_column name in the Validmind Dataset object")
|
56
|
+
|
57
|
+
sentiments = dataset.df[dataset.text_column].apply(lambda x: TextBlob(x).sentiment)
|
58
|
+
data = pd.DataFrame(
|
59
|
+
{
|
60
|
+
"polarity": [s.polarity for s in sentiments],
|
61
|
+
"subjectivity": [s.subjectivity for s in sentiments],
|
62
|
+
}
|
63
63
|
)
|
64
64
|
|
65
65
|
# Create a Plotly scatter plot
|
66
66
|
fig = px.scatter(
|
67
|
-
data,
|
67
|
+
data_frame=data,
|
68
|
+
x="polarity",
|
69
|
+
y="subjectivity",
|
70
|
+
title="Polarity vs Subjectivity",
|
68
71
|
)
|
69
72
|
fig.update_traces(textposition="top center")
|
70
|
-
fig.update_layout(xaxis_title="Polarity", yaxis_title="Subjectivity")
|
71
73
|
|
72
|
-
|
74
|
+
# Add threshold lines with names for legend
|
75
|
+
fig.add_hline(
|
76
|
+
y=threshold_subjectivity,
|
77
|
+
line_dash="dash",
|
78
|
+
line_color="gray",
|
79
|
+
opacity=0.5,
|
80
|
+
name=f"Subjectivity Threshold ({threshold_subjectivity})",
|
81
|
+
)
|
82
|
+
fig.add_vline(
|
83
|
+
x=threshold_polarity,
|
84
|
+
line_dash="dash",
|
85
|
+
line_color="gray",
|
86
|
+
opacity=0.5,
|
87
|
+
name=f"Polarity Threshold ({threshold_polarity})",
|
88
|
+
)
|
89
|
+
|
90
|
+
fig.update_layout(
|
91
|
+
xaxis_title="Polarity",
|
92
|
+
yaxis_title="Subjectivity",
|
93
|
+
xaxis=dict(range=[-1, 1]),
|
94
|
+
yaxis=dict(range=[0, 1]),
|
95
|
+
showlegend=True,
|
96
|
+
)
|
97
|
+
|
98
|
+
# Create Quadrant Distribution table
|
99
|
+
quadrant_df = pd.DataFrame(
|
100
|
+
{
|
101
|
+
"Quadrant": [
|
102
|
+
"Subjective - Positive Sentiment",
|
103
|
+
"Subjective - Negative Sentiment",
|
104
|
+
"Objective - Positive Sentiment",
|
105
|
+
"Objective - Negative Sentiment",
|
106
|
+
],
|
107
|
+
"Ratio (%)": [
|
108
|
+
(
|
109
|
+
(data["polarity"] >= threshold_polarity)
|
110
|
+
& (data["subjectivity"] >= threshold_subjectivity)
|
111
|
+
).mean()
|
112
|
+
* 100,
|
113
|
+
(
|
114
|
+
(data["polarity"] < threshold_polarity)
|
115
|
+
& (data["subjectivity"] >= threshold_subjectivity)
|
116
|
+
).mean()
|
117
|
+
* 100,
|
118
|
+
(
|
119
|
+
(data["polarity"] >= threshold_polarity)
|
120
|
+
& (data["subjectivity"] < threshold_subjectivity)
|
121
|
+
).mean()
|
122
|
+
* 100,
|
123
|
+
(
|
124
|
+
(data["polarity"] < threshold_polarity)
|
125
|
+
& (data["subjectivity"] < threshold_subjectivity)
|
126
|
+
).mean()
|
127
|
+
* 100,
|
128
|
+
],
|
129
|
+
}
|
130
|
+
)
|
131
|
+
|
132
|
+
# Create Statistics table
|
133
|
+
stats_df = pd.DataFrame(
|
134
|
+
{
|
135
|
+
"Metric": ["Polarity", "Subjectivity"],
|
136
|
+
"Range": ["[-1, 1]", "[0, 1]"],
|
137
|
+
"Mean": [data["polarity"].mean(), data["subjectivity"].mean()],
|
138
|
+
"Median": [data["polarity"].median(), data["subjectivity"].median()],
|
139
|
+
"Std": [data["polarity"].std(), data["subjectivity"].std()],
|
140
|
+
"Min": [data["polarity"].min(), data["subjectivity"].min()],
|
141
|
+
"Max": [data["polarity"].max(), data["subjectivity"].max()],
|
142
|
+
}
|
143
|
+
)
|
144
|
+
|
145
|
+
statistics_tables = {"Quadrant Distribution": quadrant_df, "Statistics": stats_df}
|
146
|
+
|
147
|
+
return fig, statistics_tables
|
@@ -8,15 +8,15 @@ Metrics functions for any Pandas-compatible datasets
|
|
8
8
|
|
9
9
|
import string
|
10
10
|
from collections import defaultdict
|
11
|
-
from dataclasses import dataclass
|
12
11
|
|
13
|
-
import
|
12
|
+
import plotly.graph_objects as go
|
14
13
|
|
15
|
-
from validmind
|
14
|
+
from validmind import tags, tasks
|
16
15
|
|
17
16
|
|
18
|
-
@
|
19
|
-
|
17
|
+
@tags("nlp", "text_data", "visualization", "frequency_analysis")
|
18
|
+
@tasks("text_classification", "text_summarization", "nlp")
|
19
|
+
def Punctuations(dataset, count_mode="token"):
|
20
20
|
"""
|
21
21
|
Analyzes and visualizes the frequency distribution of punctuation usage in a given text dataset.
|
22
22
|
|
@@ -28,10 +28,11 @@ class Punctuations(Metric):
|
|
28
28
|
|
29
29
|
### Test Mechanism
|
30
30
|
|
31
|
-
The test begins by verifying that the input "dataset" is of the type VMDataset.
|
32
|
-
|
33
|
-
|
34
|
-
|
31
|
+
The test begins by verifying that the input "dataset" is of the type VMDataset. The count_mode parameter must be
|
32
|
+
either "token" (counts punctuation marks as individual tokens) or "word" (counts punctuation marks within words).
|
33
|
+
Following that, a corpus is created from the dataset by splitting its text on spaces. Each unique punctuation
|
34
|
+
character in the text corpus is then tallied. The frequency distribution of each punctuation symbol is visualized
|
35
|
+
as a bar graph, with these results being stored as Figures and associated with the main Punctuations object.
|
35
36
|
|
36
37
|
### Signs of High Risk
|
37
38
|
|
@@ -53,45 +54,60 @@ class Punctuations(Metric):
|
|
53
54
|
- Less effective with languages that use non-standard or different punctuation.
|
54
55
|
- Visualization may lack interpretability when there are many unique punctuation marks in the dataset.
|
55
56
|
"""
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
57
|
+
# Check text column
|
58
|
+
if not dataset.text_column:
|
59
|
+
raise ValueError("Please set text_column name in the Validmind Dataset object")
|
60
|
+
|
61
|
+
if count_mode not in ["token", "word"]:
|
62
|
+
raise ValueError("count_mode parameter must be either 'token' or 'word'")
|
63
|
+
|
64
|
+
corpus = _create_corpus(dataset.df, dataset.text_column)
|
65
|
+
punctuation_counts = _count_punctuations(corpus, count_mode)
|
66
|
+
return _create_punctuation_plot(punctuation_counts)
|
67
|
+
|
68
|
+
|
69
|
+
def _create_punctuation_plot(punctuation_counts):
|
70
|
+
"""Create a bar plot visualization of punctuation frequencies."""
|
71
|
+
fig = go.Figure(
|
72
|
+
data=[
|
73
|
+
go.Bar(
|
74
|
+
x=list(punctuation_counts.keys()),
|
75
|
+
y=list(punctuation_counts.values()),
|
76
|
+
marker_color="#17C37B",
|
77
|
+
)
|
78
|
+
]
|
79
|
+
)
|
80
|
+
fig.update_layout(
|
81
|
+
title="Punctuation Distribution",
|
82
|
+
xaxis_title="Punctuation Marks",
|
83
|
+
yaxis_title="Frequency",
|
84
|
+
showlegend=False,
|
85
|
+
)
|
86
|
+
return fig
|
87
|
+
|
88
|
+
|
89
|
+
def _create_corpus(df, text_column):
|
90
|
+
"""Create a corpus from the dataset's text column."""
|
91
|
+
corpus = []
|
92
|
+
for x in df[text_column].str.split():
|
93
|
+
for i in x:
|
94
|
+
corpus.append(i)
|
95
|
+
return corpus
|
96
|
+
|
97
|
+
|
98
|
+
def _count_punctuations(corpus, count_mode="token"):
|
99
|
+
"""Count punctuation marks in the corpus based on the specified mode."""
|
100
|
+
special = string.punctuation
|
101
|
+
dic = defaultdict(int, {key: 0 for key in special})
|
102
|
+
|
103
|
+
if count_mode == "token":
|
79
104
|
for i in corpus:
|
80
105
|
if i in special:
|
81
106
|
dic[i] += 1
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
figures.append(
|
88
|
-
Figure(
|
89
|
-
for_object=self,
|
90
|
-
key=self.key,
|
91
|
-
figure=fig,
|
92
|
-
)
|
93
|
-
)
|
94
|
-
# Do this if you want to prevent the figure from being displayed
|
95
|
-
plt.close("all")
|
107
|
+
else: # count_mode == "word"
|
108
|
+
for word in corpus:
|
109
|
+
for char in word:
|
110
|
+
if char in special:
|
111
|
+
dic[char] += 1
|
96
112
|
|
97
|
-
|
113
|
+
return dic
|
@@ -47,6 +47,10 @@ def Sentiment(dataset):
|
|
47
47
|
- Relies heavily on the accuracy of the VADER sentiment analysis tool.
|
48
48
|
- Visualization alone may not provide comprehensive insights into underlying causes of sentiment distribution.
|
49
49
|
"""
|
50
|
+
# Check text column
|
51
|
+
if not dataset.text_column:
|
52
|
+
raise ValueError("Please set text_column name in the Validmind Dataset object")
|
53
|
+
|
50
54
|
nltk.download("vader_lexicon", quiet=True)
|
51
55
|
# Initialize VADER
|
52
56
|
sia = SentimentIntensityAnalyzer()
|