validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/nlp/StopWords.py
@@ -7,26 +7,21 @@ Threshold based tests
 """
 
 from collections import defaultdict
-from dataclasses import dataclass
-from typing import List
 
-import matplotlib.pyplot as plt
 import nltk
 import pandas as pd
+import plotly.graph_objects as go
 from nltk.corpus import stopwords
 
-from validmind.vm_models import (
-    Figure,
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
 
-@dataclass
-class StopWords(ThresholdTest):
+@tags("nlp", "text_data", "frequency_analysis", "visualization")
+@tasks("text_classification", "text_summarization")
+def StopWords(
+    dataset: VMDataset, min_percent_threshold: float = 0.5, num_words: int = 25
+):
     """
     Evaluates and visualizes the frequency of English stop words in a text dataset against a defined threshold.
 
@@ -75,82 +70,58 @@ class StopWords(ThresholdTest):
     or predictive accuracy.
     """
 
-    name = "stop_words"
-    required_inputs = ["dataset"]
-    default_params = {"min_percent_threshold": 0.5, "num_words": 25}
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["nlp", "text_data", "visualization", "frequency_analysis"]
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        # Create a DataFrame from the data
-        df = pd.DataFrame(results[0].values, columns=["Word", "Percentage"])
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=df,
-                    metadata=ResultTableMetadata(
-                        title=f"Stop words results for column '{self.inputs.dataset.target_column}'"
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        text_column = self.inputs.dataset.text_column
-
-        def create_corpus(df, text_column):
-            corpus = []
-            for x in df[text_column].str.split():
-                for i in x:
-                    corpus.append(i)
-            return corpus
-
-        corpus = create_corpus(self.inputs.dataset.df, text_column=text_column)
-
-        nltk.download("stopwords")
-        stop = set(stopwords.words("english"))
-        dic = defaultdict(int)
-        for word in corpus:
-            if word in stop:
-                dic[word] += 1
-        # Calculate the total number of words in the corpus
-        total_words = len(corpus)
-
-        # Calculate the percentage of each word in the corpus
-        word_percentages = {}
-        for word, count in dic.items():
-            percentage = (count / total_words) * 100
-            word_percentages[word] = percentage
-
-        passed = all(word_percentages.values()) < self.params["min_percent_threshold"]
-        top = sorted(word_percentages.items(), key=lambda x: x[1], reverse=True)[
-            : self.params["num_words"]
-        ]
-
-        test_results = [
-            ThresholdTestResult(
-                passed=passed,
-                values=top,
-            )
-        ]
-        figures = []
-        if top:
-            fig, _ = plt.subplots()
-            x, y = zip(*top)
-            plt.bar(x, y)
-            plt.xticks(rotation=90)
-
-            # Do this if you want to prevent the figure from being displayed
-            plt.close("all")
-
-            figures = []
-            figures.append(
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}",
-                    figure=fig,
-                )
-            )
+    text_column = dataset.text_column
+
+    def create_corpus(df, text_column):
+        corpus = []
+        for x in df[text_column].str.split():
+            for i in x:
+                corpus.append(i)
+        return corpus
+
+    corpus = create_corpus(dataset.df, text_column=text_column)
+
+    nltk.download("stopwords", quiet=True)
+
+    stop = set(stopwords.words("english"))
+    dic = defaultdict(int)
+    for word in corpus:
+        if word in stop:
+            dic[word] += 1
+
+    # Calculate the total number of words in the corpus
+    total_words = len(corpus)
 
-        return self.cache_results(test_results, passed=passed, figures=figures)
+    # Calculate the percentage of each word in the corpus
+    word_percentages = {}
+    for word, count in dic.items():
+        percentage = (count / total_words) * 100
+        word_percentages[word] = percentage
+
+    passed = all(word_percentages.values()) < min_percent_threshold
+    results = sorted(word_percentages.items(), key=lambda x: x[1], reverse=True)[
+        :num_words
+    ]
+
+    if not results:
+        return passed
+
+    x, y = zip(*results)
+
+    fig = go.Figure(data=[go.Bar(x=x, y=y)])
+    fig.update_layout(
+        title=f"Stop Words Frequency in '{text_column}'",
+        xaxis_title="Stop Words",
+        yaxis_title="Percentage (%)",
+        xaxis_tickangle=-45,
+    )
+
+    return (
+        {
+            f"Stop words results for column '{text_column}'": pd.DataFrame(
+                results, columns=["Word", "Percentage"]
+            )
+        },
+        fig,
+        passed,
+    )
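Taken together, the two StopWords hunks show the migration pattern used throughout this release: the ThresholdTest subclass with summary(), run(), and cache_results() becomes a plain decorated function that takes its inputs and params as arguments and returns a (tables, figure, passed) tuple. Below is a minimal sketch of exercising the rewritten test directly; it assumes the @tags/@tasks decorators leave the function callable, that vm.init_dataset accepts a text_column argument as in the library docs (depending on version, vm.init() may need to run first), and the toy data is purely illustrative.

import pandas as pd
import validmind as vm
from validmind.tests.data_validation.nlp.StopWords import StopWords

# Toy corpus; any frame with a text column works for the sketch
df = pd.DataFrame(
    {"text": ["the cat sat on the mat", "a dog and a cat played in the garden"]}
)

# Wrap the raw frame so the test can read dataset.df and dataset.text_column
vm_ds = vm.init_dataset(dataset=df, text_column="text")

# New-style tests return their payload directly instead of caching a result object
tables, fig, passed = StopWords(vm_ds, min_percent_threshold=0.5, num_words=10)
print(passed)
fig.show()

In normal use the test would be run through the test harness (see validmind/tests/run.py in the file list), which wires inputs and logs results; the direct call above only illustrates the new return contract.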
validmind/tests/data_validation/nlp/TextDescription.py
@@ -3,19 +3,98 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import string
-from dataclasses import dataclass
 
-import matplotlib.pyplot as plt
 import nltk
 import pandas as pd
 import plotly.express as px
 from nltk.corpus import stopwords
 
-from ....vm_models import Figure, Metric, VMDataset
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset
 
 
-@dataclass
-class TextDescription(Metric):
+def create_metrics_df(df, text_column, unwanted_tokens, lang):
+    stop_words = set(word.lower() for word in stopwords.words(lang))
+    unwanted_tokens = set(token.lower() for token in unwanted_tokens)
+
+    results = []
+
+    for text in df[text_column]:
+        # pre-process text
+        words = nltk.word_tokenize(text)
+        filtered_words = [
+            word
+            for word in words
+            if word.lower() not in stop_words
+            and word.lower() not in unwanted_tokens
+            and word not in string.punctuation
+        ]
+        sentences = nltk.sent_tokenize(text)
+
+        # calculate metrics
+        total_words = len(filtered_words)
+        total_sentences = len(sentences)
+        avg_sentence_length = round(
+            (
+                sum(len(sentence.split()) for sentence in sentences) / total_sentences
+                if total_sentences
+                else 0
+            ),
+            1,
+        )
+        total_paragraphs = len(text.split("\n\n"))
+        total_unique_words = len(set(filtered_words))
+        total_punctuations = sum(1 for word in words if word in string.punctuation)
+        lexical_diversity = round(
+            total_unique_words / len(filtered_words) if filtered_words else 0, 1
+        )
+
+        results.append(
+            [
+                total_words,
+                total_sentences,
+                avg_sentence_length,
+                total_paragraphs,
+                total_unique_words,
+                total_punctuations,
+                lexical_diversity,
+            ]
+        )
+
+    return pd.DataFrame(
+        results,
+        columns=[
+            "Total Words",
+            "Total Sentences",
+            "Avg Sentence Length",
+            "Total Paragraphs",
+            "Total Unique Words",
+            "Total Punctuations",
+            "Lexical Diversity",
+        ],
+    )
+
+
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def TextDescription(
+    dataset: VMDataset,
+    unwanted_tokens: set = {
+        "s",
+        "s'",
+        "mr",
+        "ms",
+        "mrs",
+        "dr",
+        "'s",
+        " ",
+        "''",
+        "dollar",
+        "us",
+        "``",
+    },
+    lang: str = "english",
+):
     """
     Conducts comprehensive textual analysis on a dataset using NLTK to evaluate various parameters and generate
     visualizations.
@@ -60,160 +139,38 @@ class TextDescription(Metric):
     - Assumes well-structured documents, which may result in inaccuracies with poorly formatted text.
     """
 
-    name = "text_description"
-    required_inputs = ["dataset"]
-    default_params = {
-        "unwanted_tokens": {
-            "s",
-            "s'",
-            "mr",
-            "ms",
-            "mrs",
-            "dr",
-            "'s",
-            " ",
-            "''",
-            "dollar",
-            "us",
-            "``",
-        },
-        "num_top_words": 3,
-        "lang": "english",
-    }
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["nlp", "text_data", "visualization"]
-
-    def general_text_metrics(self, df, text_column):
-        results = []
-
-        for text in df[text_column]:
-            sentences = nltk.sent_tokenize(text)
-            words = nltk.word_tokenize(text)
-            paragraphs = text.split("\n\n")
-
-            total_words = len(words)
-            total_sentences = len(sentences)
-            avg_sentence_length = round(
-                (
-                    sum(len(sentence.split()) for sentence in sentences)
-                    / total_sentences
-                    if total_sentences
-                    else 0
-                ),
-                1,
+    if dataset.text_column is None:
+        raise ValueError("A 'text_column' must be provided to run this test.")
+
+    nltk.download("punkt_tab", quiet=True)
+
+    metrics_df = create_metrics_df(
+        dataset.df, dataset.text_column, unwanted_tokens, lang
+    )
+
+    combinations_to_plot = [
+        ("Total Words", "Total Sentences"),
+        ("Total Words", "Total Unique Words"),
+        ("Total Sentences", "Avg Sentence Length"),
+        ("Total Unique Words", "Lexical Diversity"),
+    ]
+
+    figures = []
+
+    # Create hist plots for each column
+    for column in metrics_df.columns:
+        fig = px.histogram(metrics_df, x=column)
+        fig.update_layout(bargap=0.2)
+        figures.append(fig)
+
+    for metric1, metric2 in combinations_to_plot:
+        figures.append(
+            px.scatter(
+                metrics_df,
+                x=metric1,
+                y=metric2,
+                title=f"Scatter Plot: {metric1} vs {metric2}",
            )
-            total_paragraphs = len(paragraphs)
-
-            results.append(
-                [total_words, total_sentences, avg_sentence_length, total_paragraphs]
-            )
-
-        return pd.DataFrame(
-            results,
-            columns=[
-                "Total Words",
-                "Total Sentences",
-                "Avg Sentence Length",
-                "Total Paragraphs",
-            ],
         )
 
-    def vocabulary_structure_metrics(
-        self, df, text_column, unwanted_tokens, num_top_words, lang
-    ):
-        stop_words = set(word.lower() for word in stopwords.words(lang))
-        unwanted_tokens = set(token.lower() for token in unwanted_tokens)
-
-        results = []
-
-        for text in df[text_column]:
-            words = nltk.word_tokenize(text)
-
-            filtered_words = [
-                word
-                for word in words
-                if word.lower() not in stop_words
-                and word.lower() not in unwanted_tokens
-                and word not in string.punctuation
-            ]
-
-            total_unique_words = len(set(filtered_words))
-            total_punctuations = sum(1 for word in words if word in string.punctuation)
-            lexical_diversity = round(
-                total_unique_words / len(filtered_words) if filtered_words else 0, 1
-            )
-
-            results.append([total_unique_words, total_punctuations, lexical_diversity])
-
-        return pd.DataFrame(
-            results,
-            columns=["Total Unique Words", "Total Punctuations", "Lexical Diversity"],
-        )
-
-    # Wrapper function that combines the outputs
-    def text_description_table(self, df, params):
-        text_column = self.inputs.dataset.text_column
-        unwanted_tokens = params["unwanted_tokens"]
-        num_top_words = params["num_top_words"]
-        lang = params["lang"]
-
-        gen_metrics_df = self.general_text_metrics(df, text_column)
-        vocab_metrics_df = self.vocabulary_structure_metrics(
-            df, text_column, unwanted_tokens, num_top_words, lang
-        )
-        combined_df = pd.concat([gen_metrics_df, vocab_metrics_df], axis=1)
-
-        return combined_df
-
-    def run(self):
-        # Enforce that text_column must be provided as part of the params
-        if self.inputs.dataset.text_column is None:
-            raise ValueError("A 'text_column' must be provided to run this test.")
-
-        # Can only run this test if we have a Dataset object
-        if not isinstance(self.inputs.dataset, VMDataset):
-            raise ValueError("TextDescription requires a validmind Dataset object")
-
-        # download nltk data
-        nltk.download("punkt_tab", quiet=True)
-
-        df_text_description = self.text_description_table(
-            self.inputs.dataset.df, self.params
-        )
-
-        # Define the combinations you want to plot
-        combinations_to_plot = [
-            ("Total Words", "Total Sentences"),
-            ("Total Words", "Total Unique Words"),
-            ("Total Sentences", "Avg Sentence Length"),
-            ("Total Unique Words", "Lexical Diversity"),
-        ]
-        params = {"combinations_to_plot": combinations_to_plot}
-        figures = self.text_description_plots(df_text_description, params)
-
-        return self.cache_results(
-            figures=figures,
-        )
-
-    # Function to plot scatter plots for specified combinations using Plotly
-    def text_description_plots(self, df, params):
-        combinations_to_plot = params["combinations_to_plot"]
-        figures = []
-        # Create hist plots for each column
-        for i, column in enumerate(df.columns):
-            fig = px.histogram(df, x=column)
-            fig.update_layout(bargap=0.2)
-            # Generate a unique key for each histogram using the column name and index
-            histogram_key = f"{self.name}_histogram_{column}_{i}"
-            figures.append(Figure(for_object=self, key=histogram_key, figure=fig))
-
-        for j, (metric1, metric2) in enumerate(combinations_to_plot):
-            fig = px.scatter(
-                df, x=metric1, y=metric2, title=f"Scatter Plot: {metric1} vs {metric2}"
-            )
-            # Generate a unique key for each scatter plot using the metric names and index
-            scatter_key = f"{self.name}_scatter_{metric1}_vs_{metric2}_{j}"
-            figures.append(Figure(for_object=self, key=scatter_key, figure=fig))
-            plt.close("all")
-
-        return figures
+    return tuple(figures)
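Because the refactor hoists the row-level computation out of the class into the module-level create_metrics_df helper shown above, the metrics can now be exercised in isolation. A small sketch with toy inputs; it assumes the NLTK punkt_tab and stopwords resources can be downloaded, and the sample frame is illustrative.

import nltk
import pandas as pd
from validmind.tests.data_validation.nlp.TextDescription import create_metrics_df

# Tokenizer models and stop word lists used by the helper
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)

df = pd.DataFrame(
    {"text": ["Dr Smith arrived early. The meeting ran long.", "A short note."]}
)

# Same signature as the helper in the hunk above
metrics = create_metrics_df(df, "text", unwanted_tokens={"dr", "mr"}, lang="english")
print(metrics[["Total Words", "Total Sentences", "Lexical Diversity"]])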
validmind/tests/data_validation/nlp/Toxicity.py
@@ -49,9 +49,15 @@ def Toxicity(dataset):
     - Does not provide context-specific insights, which may be necessary for nuanced understanding.
     - May not capture all forms of subtle or indirect toxic language.
     """
+
+    # Check text column
+    if not dataset.text_column:
+        raise ValueError("Please set text_column name in the Validmind Dataset object")
+
+    text_inputs = dataset.df[dataset.text_column].tolist()
+
     toxicity = evaluate.load("toxicity")
-    input_text = dataset.df[dataset.text_column]
-    toxicity_scores = toxicity.compute(predictions=list(input_text.values))["toxicity"]
+    toxicity_scores = toxicity.compute(predictions=text_inputs)["toxicity"]
 
     fig = plt.figure()
     ax = sns.kdeplot(
@@ -62,7 +68,9 @@ def Toxicity(dataset):
         alpha=0.5,
         linewidth=0,
     )
-    ax.set_title(f"Toxicity score of {dataset.text_column} ")
+    ax.set_title(f"Toxicity score of {dataset.text_column}")
     ax.set_xlabel("Toxicity score")
-    plt.close("all")
+
+    plt.close()
+
     return fig
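All three files follow the same post-migration shape: @tags/@tasks supply the metadata that previously lived in class attributes, inputs arrive as typed parameters with defaults in the signature, and the returned tables, figures, and optional pass/fail boolean replace cache_results(). A hedged sketch of a custom test written in this style; ShortTexts is hypothetical and not part of the package, and it mirrors the return contract seen in the StopWords hunk rather than any documented API.

import plotly.graph_objects as go

from validmind import tags, tasks
from validmind.vm_models import VMDataset


@tags("nlp", "text_data")
@tasks("text_classification")
def ShortTexts(dataset: VMDataset, max_length: int = 10):
    """Flags rows whose text is shorter than max_length characters."""
    if not dataset.text_column:
        raise ValueError("Please set text_column name in the ValidMind Dataset object")

    lengths = dataset.df[dataset.text_column].str.len()
    short_rows = dataset.df[lengths < max_length]

    fig = go.Figure(data=[go.Histogram(x=lengths)])
    fig.update_layout(title=f"Text length distribution in '{dataset.text_column}'")

    # Same return contract as the rewritten StopWords: tables, figure, passed
    return {"Short texts": short_rows}, fig, short_rows.empty

Keeping thresholds in the signature rather than a default_params dict is what lets the harness surface them as overridable params while the function stays directly callable in a notebook.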