validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/nlp/StopWords.py (the class-based `ThresholdTest` becomes a decorated function):

```diff
@@ -7,26 +7,21 @@ Threshold based tests
 """
 
 from collections import defaultdict
-from dataclasses import dataclass
-from typing import List
 
-import matplotlib.pyplot as plt
 import nltk
 import pandas as pd
+import plotly.graph_objects as go
 from nltk.corpus import stopwords
 
-from validmind.vm_models import (
-    Figure,
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
 
-@dataclass
-class StopWords(ThresholdTest):
+@tags("nlp", "text_data", "frequency_analysis", "visualization")
+@tasks("text_classification", "text_summarization")
+def StopWords(
+    dataset: VMDataset, min_percent_threshold: float = 0.5, num_words: int = 25
+):
     """
     Evaluates and visualizes the frequency of English stop words in a text dataset against a defined threshold.
 
@@ -75,82 +70,58 @@ class StopWords(ThresholdTest):
     or predictive accuracy.
     """
 
-    [... old lines 78-98 (21 lines) not preserved in the source rendering ...]
-    def run(self):
-        text_column = self.inputs.dataset.text_column
-
-        def create_corpus(df, text_column):
-            corpus = []
-            for x in df[text_column].str.split():
-                for i in x:
-                    corpus.append(i)
-            return corpus
-
-        corpus = create_corpus(self.inputs.dataset.df, text_column=text_column)
-
-        nltk.download("stopwords")
-        stop = set(stopwords.words("english"))
-        dic = defaultdict(int)
-        for word in corpus:
-            if word in stop:
-                dic[word] += 1
-        # Calculate the total number of words in the corpus
-        total_words = len(corpus)
-
-        # Calculate the percentage of each word in the corpus
-        word_percentages = {}
-        for word, count in dic.items():
-            percentage = (count / total_words) * 100
-            word_percentages[word] = percentage
-
-        passed = all(word_percentages.values()) < self.params["min_percent_threshold"]
-        top = sorted(word_percentages.items(), key=lambda x: x[1], reverse=True)[
-            : self.params["num_words"]
-        ]
-
-        test_results = [
-            ThresholdTestResult(
-                passed=passed,
-                values=top,
-            )
-        ]
-        figures = []
-        if top:
-            fig, _ = plt.subplots()
-            x, y = zip(*top)
-            plt.bar(x, y)
-            plt.xticks(rotation=90)
-
-            # Do this if you want to prevent the figure from being displayed
-            plt.close("all")
-
-            figures = []
-            figures.append(
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}",
-                    figure=fig,
-                )
-            )
+    text_column = dataset.text_column
+
+    def create_corpus(df, text_column):
+        corpus = []
+        for x in df[text_column].str.split():
+            for i in x:
+                corpus.append(i)
+        return corpus
+
+    corpus = create_corpus(dataset.df, text_column=text_column)
+
+    nltk.download("stopwords", quiet=True)
+
+    stop = set(stopwords.words("english"))
+    dic = defaultdict(int)
+    for word in corpus:
+        if word in stop:
+            dic[word] += 1
+
+    # Calculate the total number of words in the corpus
+    total_words = len(corpus)
 
-
+    # Calculate the percentage of each word in the corpus
+    word_percentages = {}
+    for word, count in dic.items():
+        percentage = (count / total_words) * 100
+        word_percentages[word] = percentage
+
+    passed = all(word_percentages.values()) < min_percent_threshold
+    results = sorted(word_percentages.items(), key=lambda x: x[1], reverse=True)[
+        :num_words
+    ]
+
+    if not results:
+        return passed
+
+    x, y = zip(*results)
+
+    fig = go.Figure(data=[go.Bar(x=x, y=y)])
+    fig.update_layout(
+        title=f"Stop Words Frequency in '{text_column}'",
+        xaxis_title="Stop Words",
+        yaxis_title="Percentage (%)",
+        xaxis_tickangle=-45,
+    )
+
+    return (
+        {
+            f"Stop words results for column '{text_column}'": pd.DataFrame(
+                results, columns=["Word", "Percentage"]
+            )
+        },
+        fig,
+        passed,
+    )
```
validmind/tests/data_validation/nlp/TextDescription.py (the `Metric` subclass becomes a module-level helper plus a decorated function):

```diff
@@ -3,19 +3,98 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import string
-from dataclasses import dataclass
 
-import matplotlib.pyplot as plt
 import nltk
 import pandas as pd
 import plotly.express as px
 from nltk.corpus import stopwords
 
-from validmind.vm_models import Figure, Metric, VMDataset
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
 
-@dataclass
-class TextDescription(Metric):
+def create_metrics_df(df, text_column, unwanted_tokens, lang):
+    stop_words = set(word.lower() for word in stopwords.words(lang))
+    unwanted_tokens = set(token.lower() for token in unwanted_tokens)
+
+    results = []
+
+    for text in df[text_column]:
+        # pre-process text
+        words = nltk.word_tokenize(text)
+        filtered_words = [
+            word
+            for word in words
+            if word.lower() not in stop_words
+            and word.lower() not in unwanted_tokens
+            and word not in string.punctuation
+        ]
+        sentences = nltk.sent_tokenize(text)
+
+        # calculate metrics
+        total_words = len(filtered_words)
+        total_sentences = len(sentences)
+        avg_sentence_length = round(
+            (
+                sum(len(sentence.split()) for sentence in sentences) / total_sentences
+                if total_sentences
+                else 0
+            ),
+            1,
+        )
+        total_paragraphs = len(text.split("\n\n"))
+        total_unique_words = len(set(filtered_words))
+        total_punctuations = sum(1 for word in words if word in string.punctuation)
+        lexical_diversity = round(
+            total_unique_words / len(filtered_words) if filtered_words else 0, 1
+        )
+
+        results.append(
+            [
+                total_words,
+                total_sentences,
+                avg_sentence_length,
+                total_paragraphs,
+                total_unique_words,
+                total_punctuations,
+                lexical_diversity,
+            ]
+        )
+
+    return pd.DataFrame(
+        results,
+        columns=[
+            "Total Words",
+            "Total Sentences",
+            "Avg Sentence Length",
+            "Total Paragraphs",
+            "Total Unique Words",
+            "Total Punctuations",
+            "Lexical Diversity",
+        ],
+    )
+
+
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def TextDescription(
+    dataset: VMDataset,
+    unwanted_tokens: set = {
+        "s",
+        "s'",
+        "mr",
+        "ms",
+        "mrs",
+        "dr",
+        "'s",
+        " ",
+        "''",
+        "dollar",
+        "us",
+        "``",
+    },
+    lang: str = "english",
+):
     """
     Conducts comprehensive textual analysis on a dataset using NLTK to evaluate various parameters and generate
     visualizations.
@@ -60,160 +139,38 @@ class TextDescription(Metric):
     - Assumes well-structured documents, which may result in inaccuracies with poorly formatted text.
     """
 
-    [... old lines 63-93 (31 lines) not preserved in the source rendering ...]
-            total_words = len(words)
-            total_sentences = len(sentences)
-            avg_sentence_length = round(
-                (
-                    sum(len(sentence.split()) for sentence in sentences)
-                    / total_sentences
-                    if total_sentences
-                    else 0
-                ),
-                1,
+    if dataset.text_column is None:
+        raise ValueError("A 'text_column' must be provided to run this test.")
+
+    nltk.download("punkt_tab", quiet=True)
+
+    metrics_df = create_metrics_df(
+        dataset.df, dataset.text_column, unwanted_tokens, lang
+    )
+
+    combinations_to_plot = [
+        ("Total Words", "Total Sentences"),
+        ("Total Words", "Total Unique Words"),
+        ("Total Sentences", "Avg Sentence Length"),
+        ("Total Unique Words", "Lexical Diversity"),
+    ]
+
+    figures = []
+
+    # Create hist plots for each column
+    for column in metrics_df.columns:
+        fig = px.histogram(metrics_df, x=column)
+        fig.update_layout(bargap=0.2)
+        figures.append(fig)
+
+    for metric1, metric2 in combinations_to_plot:
+        figures.append(
+            px.scatter(
+                metrics_df,
+                x=metric1,
+                y=metric2,
+                title=f"Scatter Plot: {metric1} vs {metric2}",
             )
-            total_paragraphs = len(paragraphs)
-
-            results.append(
-                [total_words, total_sentences, avg_sentence_length, total_paragraphs]
-            )
-
-        return pd.DataFrame(
-            results,
-            columns=[
-                "Total Words",
-                "Total Sentences",
-                "Avg Sentence Length",
-                "Total Paragraphs",
-            ],
         )
 
-    def vocabulary_structure_metrics(
-        self, df, text_column, unwanted_tokens, num_top_words, lang
-    ):
-        stop_words = set(word.lower() for word in stopwords.words(lang))
-        unwanted_tokens = set(token.lower() for token in unwanted_tokens)
-
-        results = []
-
-        for text in df[text_column]:
-            words = nltk.word_tokenize(text)
-
-            filtered_words = [
-                word
-                for word in words
-                if word.lower() not in stop_words
-                and word.lower() not in unwanted_tokens
-                and word not in string.punctuation
-            ]
-
-            total_unique_words = len(set(filtered_words))
-            total_punctuations = sum(1 for word in words if word in string.punctuation)
-            lexical_diversity = round(
-                total_unique_words / len(filtered_words) if filtered_words else 0, 1
-            )
-
-            results.append([total_unique_words, total_punctuations, lexical_diversity])
-
-        return pd.DataFrame(
-            results,
-            columns=["Total Unique Words", "Total Punctuations", "Lexical Diversity"],
-        )
-
-    # Wrapper function that combines the outputs
-    def text_description_table(self, df, params):
-        text_column = self.inputs.dataset.text_column
-        unwanted_tokens = params["unwanted_tokens"]
-        num_top_words = params["num_top_words"]
-        lang = params["lang"]
-
-        gen_metrics_df = self.general_text_metrics(df, text_column)
-        vocab_metrics_df = self.vocabulary_structure_metrics(
-            df, text_column, unwanted_tokens, num_top_words, lang
-        )
-        combined_df = pd.concat([gen_metrics_df, vocab_metrics_df], axis=1)
-
-        return combined_df
-
-    def run(self):
-        # Enforce that text_column must be provided as part of the params
-        if self.inputs.dataset.text_column is None:
-            raise ValueError("A 'text_column' must be provided to run this test.")
-
-        # Can only run this test if we have a Dataset object
-        if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("TextDescription requires a validmind Dataset object")
-
-        # download nltk data
-        nltk.download("punkt_tab", quiet=True)
-
-        df_text_description = self.text_description_table(
-            self.inputs.dataset.df, self.params
-        )
-
-        # Define the combinations you want to plot
-        combinations_to_plot = [
-            ("Total Words", "Total Sentences"),
-            ("Total Words", "Total Unique Words"),
-            ("Total Sentences", "Avg Sentence Length"),
-            ("Total Unique Words", "Lexical Diversity"),
-        ]
-        params = {"combinations_to_plot": combinations_to_plot}
-        figures = self.text_description_plots(df_text_description, params)
-
-        return self.cache_results(
-            figures=figures,
-        )
-
-    # Function to plot scatter plots for specified combinations using Plotly
-    def text_description_plots(self, df, params):
-        combinations_to_plot = params["combinations_to_plot"]
-        figures = []
-        # Create hist plots for each column
-        for i, column in enumerate(df.columns):
-            fig = px.histogram(df, x=column)
-            fig.update_layout(bargap=0.2)
-            # Generate a unique key for each histogram using the column name and index
-            histogram_key = f"{self.name}_histogram_{column}_{i}"
-            figures.append(Figure(for_object=self, key=histogram_key, figure=fig))
-
-        for j, (metric1, metric2) in enumerate(combinations_to_plot):
-            fig = px.scatter(
-                df, x=metric1, y=metric2, title=f"Scatter Plot: {metric1} vs {metric2}"
-            )
-            # Generate a unique key for each scatter plot using the metric names and index
-            scatter_key = f"{self.name}_scatter_{metric1}_vs_{metric2}_{j}"
-            figures.append(Figure(for_object=self, key=scatter_key, figure=fig))
-        plt.close("all")
-
-        return figures
+    return tuple(figures)
```
validmind/tests/data_validation/nlp/Toxicity.py:

```diff
@@ -49,9 +49,15 @@ def Toxicity(dataset):
     - Does not provide context-specific insights, which may be necessary for nuanced understanding.
     - May not capture all forms of subtle or indirect toxic language.
     """
+
+    # Check text column
+    if not dataset.text_column:
+        raise ValueError("Please set text_column name in the Validmind Dataset object")
+
+    text_inputs = dataset.df[dataset.text_column].tolist()
+
     toxicity = evaluate.load("toxicity")
-
-    toxicity_scores = toxicity.compute(predictions=list(input_text.values))["toxicity"]
+    toxicity_scores = toxicity.compute(predictions=text_inputs)["toxicity"]
 
     fig = plt.figure()
     ax = sns.kdeplot(
@@ -62,7 +68,9 @@ def Toxicity(dataset):
         alpha=0.5,
         linewidth=0,
     )
-    ax.set_title(f"Toxicity score of {dataset.text_column}
+    ax.set_title(f"Toxicity score of {dataset.text_column}")
     ax.set_xlabel("Toxicity score")
-
+
+    plt.close()
+
     return fig
```