validmind 2.1.1__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +3 -3
  3. validmind/api_client.py +2 -3
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/html_templates/__init__.py +0 -0
  13. validmind/html_templates/content_blocks.py +89 -14
  14. validmind/models/__init__.py +7 -4
  15. validmind/models/foundation.py +8 -34
  16. validmind/models/function.py +51 -0
  17. validmind/models/huggingface.py +16 -46
  18. validmind/models/metadata.py +42 -0
  19. validmind/models/pipeline.py +66 -0
  20. validmind/models/pytorch.py +8 -42
  21. validmind/models/r_model.py +33 -82
  22. validmind/models/sklearn.py +39 -38
  23. validmind/template.py +8 -26
  24. validmind/tests/__init__.py +43 -20
  25. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  26. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  27. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  28. validmind/tests/data_validation/Duplicates.py +1 -1
  29. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  30. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  31. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  32. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  34. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  35. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  36. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  37. validmind/tests/decorator.py +2 -2
  38. validmind/tests/model_validation/BertScore.py +100 -98
  39. validmind/tests/model_validation/BleuScore.py +93 -64
  40. validmind/tests/model_validation/ContextualRecall.py +74 -91
  41. validmind/tests/model_validation/MeteorScore.py +86 -74
  42. validmind/tests/model_validation/RegardScore.py +103 -121
  43. validmind/tests/model_validation/RougeScore.py +118 -0
  44. validmind/tests/model_validation/TokenDisparity.py +84 -121
  45. validmind/tests/model_validation/ToxicityScore.py +109 -123
  46. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  47. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  48. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  50. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  51. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  52. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  56. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  57. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  58. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  59. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  60. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  61. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  62. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  63. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  65. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  66. validmind/tests/model_validation/ragas/utils.py +66 -0
  67. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  68. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  69. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  70. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  71. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  72. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  73. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  74. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
  75. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  76. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  83. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  84. validmind/unit_metrics/__init__.py +26 -49
  85. validmind/unit_metrics/composite.py +5 -1
  86. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  87. validmind/utils.py +56 -6
  88. validmind/vm_models/__init__.py +1 -1
  89. validmind/vm_models/dataset/__init__.py +7 -0
  90. validmind/vm_models/dataset/dataset.py +558 -0
  91. validmind/vm_models/dataset/utils.py +146 -0
  92. validmind/vm_models/model.py +97 -72
  93. validmind/vm_models/test/result_wrapper.py +61 -24
  94. validmind/vm_models/test_context.py +1 -1
  95. validmind/vm_models/test_suite/summary.py +3 -4
  96. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
  97. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
  98. validmind/models/catboost.py +0 -33
  99. validmind/models/statsmodels.py +0 -50
  100. validmind/models/xgboost.py +0 -30
  101. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  102. validmind/tests/model_validation/RegardHistogram.py +0 -148
  103. validmind/tests/model_validation/RougeMetrics.py +0 -147
  104. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  105. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  106. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  107. validmind/vm_models/dataset.py +0 -1303
  108. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
  109. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
  110. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/RougeScore.py (new file)

@@ -0,0 +1,118 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import pandas as pd
+ import plotly.graph_objects as go
+ from rouge import Rouge
+
+ from validmind import tags, tasks
+
+
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def RougeScore(dataset, model, metric="rouge-1"):
+     """
+     Evaluates the quality of machine-generated text using ROUGE metrics and visualizes the results through histograms
+     and bar charts, alongside compiling a comprehensive table of descriptive statistics for each ROUGE metric.
+
+     **Purpose:**
+     This function is designed to assess the quality of text generated by machine learning models using various ROUGE metrics.
+     ROUGE, which stands for Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics used to evaluate the
+     overlap of n-grams, word sequences, and word pairs between the machine-generated text and reference texts. This evaluation
+     is crucial for tasks such as text summarization, machine translation, and text generation, where the goal is to produce text
+     that accurately reflects the content and meaning of human-crafted references.
+
+     **Test Mechanism:**
+     The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes the ROUGE
+     evaluator with the specified metric (e.g., ROUGE-1). For each pair of true and predicted texts, the function calculates the ROUGE
+     scores and compiles them into a dataframe. Histograms and bar charts are generated for each ROUGE metric (Precision, Recall, and F1 Score)
+     to visualize their distribution. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum)
+     is compiled for each metric, providing a comprehensive summary of the model's performance.
+
+     **Signs of High Risk:**
+
+     - Consistently low scores across ROUGE metrics could indicate poor quality in the generated text, suggesting that the model fails
+     to capture the essential content of the reference texts.
+     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
+     - Low recall scores may indicate that important information from the reference text is being omitted.
+     - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the model's ability
+     to balance informativeness and conciseness.
+
+     **Strengths:**
+
+     - Provides a multifaceted evaluation of text quality through different ROUGE metrics, offering a detailed view of model performance.
+     - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
+
+     **Limitations:**
+
+     - ROUGE metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or grammatical quality of the text.
+     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
+     - While useful for comparison, ROUGE scores alone do not provide a complete assessment of a model's performance and should be
+     supplemented with other metrics and qualitative analysis.
+     """
+
+     # Extract true and predicted values
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)
+
+     # Initialize Rouge with the specified metric
+     rouge = Rouge(metrics=[metric])
+
+     # Calculate ROUGE scores
+     score_list = []
+     for y_t, y_p in zip(y_true, y_pred):
+         scores = rouge.get_scores(y_p, y_t, avg=True)
+         score_list.append(scores)
+
+     # Convert scores to a dataframe
+     metrics_df = pd.DataFrame(score_list)
+     df_scores = pd.DataFrame(metrics_df[metric].tolist())
+
+     # Generate histograms and bar charts for each score type
+     score_types = ["p", "r", "f"]
+     score_names = ["Precision", "Recall", "F1 Score"]
+     figures = []
+
+     for score_type, score_name in zip(score_types, score_names):
+         # Histogram
+         hist_fig = go.Figure(data=[go.Histogram(x=df_scores[score_type])])
+         hist_fig.update_layout(
+             title=f"{score_name} Histogram for {metric.upper()}",
+             xaxis_title=score_name,
+             yaxis_title="Count",
+         )
+         figures.append(hist_fig)
+
+         # Bar Chart
+         bar_fig = go.Figure(data=[go.Bar(x=df_scores.index, y=df_scores[score_type])])
+         bar_fig.update_layout(
+             title=f"{score_name} Bar Chart for {metric.upper()}",
+             xaxis_title="Row Index",
+             yaxis_title=score_name,
+         )
+         figures.append(bar_fig)
+
+     # Calculate statistics for each score type
+     stats_df = df_scores.describe().loc[["mean", "50%", "max", "min", "std"]]
+     stats_df = stats_df.rename(
+         index={
+             "mean": "Mean Score",
+             "50%": "Median Score",
+             "max": "Max Score",
+             "min": "Min Score",
+             "std": "Standard Deviation",
+         }
+     ).T
+     stats_df["Count"] = len(df_scores)
+
+     # Rename metrics for clarity
+     stats_df.index = stats_df.index.map(
+         {"p": "Precision", "r": "Recall", "f": "F1 Score"}
+     )
+
+     # Create a DataFrame from all collected statistics
+     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+     return (result_df, *tuple(figures))
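
Note: the new RougeScore test is a plain decorated function that returns a statistics table plus Plotly figures, rather than a Metric subclass. Below is a minimal standalone sketch of its scoring step, using the same `rouge` library calls shown in the diff above; the example strings are hypothetical and not part of the package.

    import pandas as pd
    from rouge import Rouge

    references = ["the cat sat on the mat", "a quick brown fox"]
    predictions = ["the cat is on the mat", "a fast brown fox jumps"]

    # Score each prediction against its reference, one pair at a time, as the test does
    rouge = Rouge(metrics=["rouge-1"])
    scores = [rouge.get_scores(p, r, avg=True) for p, r in zip(predictions, references)]

    # One row per pair with precision ("p"), recall ("r"), and F1 ("f") columns
    df_scores = pd.DataFrame(pd.DataFrame(scores)["rouge-1"].tolist())
    print(df_scores.describe().loc[["mean", "50%", "std"]])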

validmind/tests/model_validation/TokenDisparity.py

@@ -2,139 +2,102 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- import itertools
- from dataclasses import dataclass
-
  import pandas as pd
  import plotly.graph_objects as go
- from plotly.subplots import make_subplots
- from transformers import BertTokenizer

- from validmind.vm_models import Figure, Metric
+ from validmind import tags, tasks


- @dataclass
- class TokenDisparity(Metric):
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def TokenDisparity(dataset, model):
      """
-     Assess and visualize token count disparity between model's predicted and actual dataset.
-
-     **Purpose**:
-     The Token Disparity metric is designed to assess the distributional congruence between the model's predicted
-     outputs and the actual data. This is achieved by constructing histograms that illustrate the disparity in token
-     count between the two columns. Additionally, this metric is used to measure the model's verbosity in comparison to
-     the genuine dataset.
-
-     **Test Mechanism**:
-     The mechanism of running this test involves tokenizing both columns: one containing the actual data and the other
-     containing the model's predictions. The BERT tokenizer is used for tokenizing the contents of each column. After
-     tokenization, tokens in each column are counted and represented in two distinct histograms to facilitate the
-     visualization of token count distribution in the actual and predicted data. To quantify the difference in
-     distribution, the histogram of the actual tokens is compared with the histogram of the predicted tokens.
-
-     **Signs of High Risk**:
-     High risk or potential failure in model performance may be suggested by:
-
-     - Significant incongruities in distribution patterns between the two histograms.
-     - Marked divergence of the predicted histogram from the reference histogram, indicating that the model may be
-     generating output with unexpected verbosity.
-     - This might result in an output that has a significantly higher or lower number of tokens than expected.
-
-     **Strengths**:
-     Strengths of the Token Disparity metric include:
-
-     - It provides a clear and visual comparison of predicted versus actual token distributions, enhancing understanding
-     of the model's output consistency and verbosity.
-     - It is able to detect potential issues with the model's output generation capability, such as over-production or
-     under-production of tokens compared to the actual data set.
-
-     **Limitations**:
-     Limitations of the Token Disparity metric include:
-
-     - The metric focuses solely on token count, disregarding the semantics behind those tokens. Consequently, it may
-     miss out on issues related to relevance or meaningfulness of produced tokens.
-     - The assumption that similar token count between predicted and actual data suggests accurate output, which is not
-     always the case.
-     - Dependence on the BERT tokenizer, which may not always be the optimum choice for all types of text data.
+     Evaluates the token disparity between reference and generated texts, visualizing the results through histograms
+     and bar charts, alongside compiling a comprehensive table of descriptive statistics for token counts.
+
+     **Purpose:**
+     This function is designed to assess the token disparity between reference and generated texts. Token disparity is
+     important for understanding how closely the length and token usage of generated texts match the reference texts.
+
+     **Test Mechanism:**
+     The function starts by extracting the true and predicted values from the provided dataset and model. It then calculates
+     the number of tokens in each reference and generated text. Histograms and bar charts are generated for the token counts
+     of both reference and generated texts to visualize their distribution. Additionally, a table of descriptive statistics
+     (mean, median, standard deviation, minimum, and maximum) is compiled for the token counts, providing a comprehensive
+     summary of the model's performance.
+
+     **Signs of High Risk:**
+     - Significant disparity in token counts between reference and generated texts could indicate issues with text generation
+     quality, such as verbosity or lack of detail.
+     - Consistently low token counts in generated texts compared to references might suggest that the model is producing
+     incomplete or overly concise outputs.
+
+     **Strengths:**
+     - Provides a simple yet effective evaluation of text length and token usage.
+     - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of token counts.
+     - Descriptive statistics offer a concise summary of the model's performance in generating texts of appropriate length.
+
+     **Limitations:**
+     - Token counts alone do not provide a complete assessment of text quality and should be supplemented with other metrics and qualitative analysis.
      """

-     name = "token_disparity"
-     required_inputs = ["model", "dataset"]
-
-     def run(self):
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-         df = pd.DataFrame({"reference_column": y_true, "generated_column": y_pred})
-
-         fig = self.token_disparity_histograms(df)
-         figures = []
-         figures.append(
-             Figure(
-                 for_object=self,
-                 key=self.key,
-                 figure=fig,
-             )
-         )
-         return self.cache_results(figures=figures)
-
-     def token_disparity_histograms(self, df):
-         """
-         Visualize the token counts distribution of two given columns using histograms.
-
-         :param df: DataFrame containing the text columns.
-         :param params: Dictionary with the keys ["reference_column", "generated_column"].
-         """
+     # Extract true and predicted values
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)

-         reference_column = "reference_column"
-         generated_column = "generated_column"
+     # Calculate token counts
+     token_counts_true = [len(text.split()) for text in y_true]
+     token_counts_pred = [len(text.split()) for text in y_pred]

-         # Initialize the tokenizer
-         tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+     # Create a dataframe for reference and generated token counts
+     df = pd.DataFrame(
+         {"reference_tokens": token_counts_true, "generated_tokens": token_counts_pred}
+     )

-         # Tokenize the columns and get the number of tokens
-         df["tokens_1"] = df[reference_column].apply(
-             lambda x: len(tokenizer.tokenize(x))
-         )
-         df["tokens_2"] = df[generated_column].apply(
-             lambda x: len(tokenizer.tokenize(x))
-         )
+     figures = []

-         # Create subplots: 1 row, 2 columns
-         fig = make_subplots(
-             rows=1,
-             cols=2,
-             subplot_titles=(
-                 f"Tokens in {reference_column}",
-                 f"Tokens in {generated_column}",
-             ),
-         )
+     # Generate histograms and bar charts for reference and generated token counts
+     token_types = ["reference_tokens", "generated_tokens"]
+     token_names = ["Reference Tokens", "Generated Tokens"]

-         # Add histograms
-         fig.add_trace(
-             go.Histogram(
-                 x=df["tokens_1"],
-                 marker_color="blue",
-                 name=f"Tokens in {reference_column}",
-             ),
-             row=1,
-             col=1,
+     for token_type, token_name in zip(token_types, token_names):
+         # Histogram
+         hist_fig = go.Figure(data=[go.Histogram(x=df[token_type])])
+         hist_fig.update_layout(
+             title=f"{token_name} Histogram",
+             xaxis_title=token_name,
+             yaxis_title="Count",
          )
-
-         fig.add_trace(
-             go.Histogram(
-                 x=df["tokens_2"],
-                 marker_color="red",
-                 name=f"Tokens in {generated_column}",
-             ),
-             row=1,
-             col=2,
+         figures.append(hist_fig)
+
+         # Bar Chart
+         bar_fig = go.Figure(data=[go.Bar(x=df.index, y=df[token_type])])
+         bar_fig.update_layout(
+             title=f"{token_name} Bar Chart",
+             xaxis_title="Row Index",
+             yaxis_title=token_name,
          )
-
-         # Update layout
-         fig.update_layout(title_text="Token Distributions", bargap=0.1)
-
-         fig.update_yaxes(title_text="Number of Documents")
-         fig.update_xaxes(title_text="Number of Tokens", row=1, col=1)
-         fig.update_xaxes(title_text="Number of Tokens", row=1, col=2)
-
-         return fig
+         figures.append(bar_fig)
+
+     # Calculate statistics for each token count type
+     stats_df = df.describe().loc[["mean", "50%", "max", "min", "std"]]
+     stats_df = stats_df.rename(
+         index={
+             "mean": "Mean Count",
+             "50%": "Median Count",
+             "max": "Max Count",
+             "min": "Min Count",
+             "std": "Standard Deviation",
+         }
+     ).T
+     stats_df["Count"] = len(df)
+
+     # Rename columns for clarity
+     stats_df.index = stats_df.index.map(
+         {"reference_tokens": "Reference Tokens", "generated_tokens": "Generated Tokens"}
+     )
+
+     # Create a DataFrame from all collected statistics
+     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+     return (result_df, *tuple(figures))
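
Note: the rewritten TokenDisparity test above replaces the BERT tokenizer of 2.1.1 with plain whitespace token counts. Below is a minimal standalone sketch of that counting step; the example strings are hypothetical and not part of the package.

    import pandas as pd

    references = ["the report covers third quarter revenue and churn", "model risk is low"]
    generated = ["the report covers revenue", "model risk is low overall this quarter"]

    # Whitespace token counts per text, as in the new implementation
    df = pd.DataFrame(
        {
            "reference_tokens": [len(t.split()) for t in references],
            "generated_tokens": [len(t.split()) for t in generated],
        }
    )

    # Large gaps between the two columns flag verbose or truncated generations
    print(df.assign(disparity=df["generated_tokens"] - df["reference_tokens"]))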

validmind/tests/model_validation/ToxicityScore.py

@@ -2,146 +2,132 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- import itertools
- from dataclasses import dataclass
-
  import evaluate
  import pandas as pd
  import plotly.graph_objects as go
- import plotly.subplots as sp

- from validmind.vm_models import Figure, Metric
+ from validmind import tags, tasks


- @dataclass
- class ToxicityScore(Metric):
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def ToxicityScore(dataset, model):
      """
+     Computes and visualizes the toxicity score for input text, true text, and predicted text, assessing content quality and potential risk.
+
      **Purpose:**
-     The ToxicityScore metric is designed to present a sequential representation of toxicity scores for various texts.
-     Leveraging line plots, it gives an overview of how toxicity scores evolve across the sequence of texts, highlighting
-     trends and patterns.
+     The ToxicityScore metric is designed to evaluate the toxicity levels of texts generated by models. This is crucial for
+     identifying and mitigating harmful or offensive content in machine-generated texts.

      **Test Mechanism:**
-     The mechanism involves fetching texts from specific columns, computing their toxicity scores using a preloaded
-     `toxicity` evaluation tool, and then plotting these scores. A multi-panel visualization is created where each
-     panel is dedicated to a specific text data column. Line plots serve as the primary visual tool, showing the progression of toxicity scores across text sequences. Each
-     line plot corresponds to a specific text data column, illustrating the variation in toxicity scores as one moves
-     from one text segment to the next.
+     The function starts by extracting the input, true, and predicted values from the provided dataset and model. The toxicity score is
+     computed for each text using a preloaded `toxicity` evaluation tool. The scores are compiled into dataframes, and histograms
+     and bar charts are generated to visualize the distribution of toxicity scores. Additionally, a table of descriptive statistics
+     (mean, median, standard deviation, minimum, and maximum) is compiled for the toxicity scores, providing a comprehensive
+     summary of the model's performance.

      **Signs of High Risk:**
-     Drastic spikes in the line plots, especially those that reach high toxicity values, indicate potentially toxic
-     content within the associated text segment. If predicted summaries diverge significantly from input or target
-     texts, it could be indicative of issues in the model's generated content.
+     - Drastic spikes in toxicity scores indicate potentially toxic content within the associated text segment.
+     - Persistent high toxicity scores across multiple texts may suggest systemic issues in the model's text generation process.

      **Strengths:**
-     The ToxicityScore offers a dynamic view of toxicity trends, enabling users to detect patterns or irregularities
-     across the dataset. This is particularly valuable when comparing predicted content with actual data, helping
-     highlight any inconsistencies or abnormalities in model output.
+     - Provides a clear evaluation of toxicity levels in generated texts, helping to ensure content safety and appropriateness.
+     - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of toxicity scores.
+     - Descriptive statistics offer a concise summary of the model's performance in generating non-toxic texts.

      **Limitations:**
-     This metric’s accuracy is contingent upon the underlying `toxicity` tool. The line plots provide a broad overview
-     of toxicity trends but do not specify which portions or tokens of the text are responsible for high toxicity scores.
-     Consequently, for granular insights, supplementary, in-depth analysis might be needed.
+     - The accuracy of the toxicity scores is contingent upon the underlying `toxicity` tool.
+     - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for high toxicity.
+     - Supplementary, in-depth analysis might be needed for granular insights.
      """

-     name = "toxicity_line_plot"
-     required_inputs = ["model", "dataset"]
-     metadata = {
-         "task_types": [
-             "text_classification",
-             "text_summarization",
-         ],
-         "tags": ["toxicity_line_plot"],
-     }
-
-     def _get_datasets(self):
-         # Check model attributes
-         if not hasattr(self, "model"):
-             raise AttributeError("The 'model' attribute is missing.")
-
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-         input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
-
-         # Ensure consistency in lengths
-         if not len(y_true) == len(y_pred) == len(input_text):
-             raise ValueError(
-                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
-             )
-
-         return input_text, y_true, y_pred
-
-     def toxicity_line_plots(self, df):
-         """
-         Compute toxicity scores for texts and then plot line plots for all columns of df.
-
-         Parameters:
-         - df (pd.DataFrame): The dataframe containing texts.
-         """
-
-         # Extract necessary parameters
-         toxicity = evaluate.load("toxicity")
-
-         # Get all columns of df
-         text_columns = df.columns.tolist()
-
-         # Determine the number of rows required based on the number of text columns
-         num_rows = (len(text_columns) + 1) // 2
-
-         # Create a subplot layout
-         fig = sp.make_subplots(rows=num_rows, cols=2, subplot_titles=text_columns)
-
-         subplot_height = 350
-         total_height = num_rows * subplot_height + 200
-
-         for idx, col in enumerate(text_columns, start=1):
-             row = (idx - 1) // 2 + 1
-             col_idx = (idx - 1) % 2 + 1
-
-             # Get list of texts from dataframe
-             texts = df[col].tolist()
-
-             # Compute toxicity for texts
-             toxicity_scores = toxicity.compute(predictions=texts)["toxicity"]
-
-             # Add traces to the corresponding subplot
-             fig.add_trace(
-                 go.Scatter(
-                     y=toxicity_scores,
-                     mode="lines+markers",
-                     marker=dict(size=5),
-                     line=dict(width=1.5),
-                     showlegend=False,
-                 ),
-                 row=row,
-                 col=col_idx,
-             )
-
-             # Update xaxes and yaxes titles only for the first subplot
-             if idx == 1:
-                 fig.update_xaxes(title_text="Text Index", row=row, col=col_idx)
-                 fig.update_yaxes(title_text="Toxicity Score", row=row, col=col_idx)
-
-         # Update layout
-         fig.update_layout(
-             title_text="Line Plots of Toxicity Scores", height=total_height
-         )
-
-         return fig
-
-     def run(self):
-         input_text, y_true, y_pred = self._get_datasets()
-
-         df = pd.DataFrame(
-             {
-                 "Input Text": input_text,
-                 "Target Text": y_true,
-                 "Predicted Summaries": y_pred,
-             }
+     # Extract true, predicted, and input values
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)
+     input_text = dataset.df[dataset.text_column]
+
+     # Load the toxicity evaluation metric
+     toxicity = evaluate.load("toxicity")
+
+     # Function to calculate toxicity scores
+     def compute_toxicity_scores(texts):
+         scores = []
+         for text in texts:
+             score = toxicity.compute(predictions=[text])
+             scores.append(score["toxicity"])
+         return scores
+
+     # Calculate toxicity scores for input, true, and predicted texts
+     input_toxicity = compute_toxicity_scores(input_text)
+     true_toxicity = compute_toxicity_scores(y_true)
+     pred_toxicity = compute_toxicity_scores(y_pred)
+
+     # Convert scores to dataframes
+     input_df = pd.DataFrame(input_toxicity, columns=["Input Text Toxicity"])
+     true_df = pd.DataFrame(true_toxicity, columns=["True Text Toxicity"])
+     pred_df = pd.DataFrame(pred_toxicity, columns=["Predicted Text Toxicity"])
+
+     figures = []
+
+     # Function to create histogram and bar chart for toxicity scores
+     def create_figures(df, title):
+         # Histogram
+         hist_fig = go.Figure(data=[go.Histogram(x=df.iloc[:, 0])])
+         hist_fig.update_layout(
+             title=f"{title} Histogram",
+             xaxis_title=title,
+             yaxis_title="Count",
          )
-
-         fig = self.toxicity_line_plots(df)
-
-         return self.cache_results(
-             figures=[Figure(for_object=self, key=self.key, figure=fig)]
+         figures.append(hist_fig)
+
+         # Bar Chart
+         bar_fig = go.Figure(data=[go.Bar(x=df.index, y=df.iloc[:, 0])])
+         bar_fig.update_layout(
+             title=f"{title} Bar Chart",
+             xaxis_title="Text Instance Index",
+             yaxis_title=title,
          )
+         figures.append(bar_fig)
+
+     # Create figures for each toxicity score dataframe
+     create_figures(input_df, "Input Text Toxicity")
+     create_figures(true_df, "True Text Toxicity")
+     create_figures(pred_df, "Predicted Text Toxicity")
+
+     # Calculate statistics for each toxicity score dataframe
+     def calculate_stats(df):
+         stats = df.describe().loc[["mean", "50%", "max", "min", "std"]].T
+         stats.columns = [
+             "Mean Score",
+             "Median Score",
+             "Max Score",
+             "Min Score",
+             "Standard Deviation",
+         ]
+         stats["Metric"] = df.columns[0]
+         stats["Count"] = len(df)
+         return stats
+
+     input_stats = calculate_stats(input_df)
+     true_stats = calculate_stats(true_df)
+     pred_stats = calculate_stats(pred_df)
+
+     # Combine statistics into a single dataframe
+     result_df = (
+         pd.concat([input_stats, true_stats, pred_stats])
+         .reset_index()
+         .rename(columns={"index": "Statistic"})
+     )
+     result_df = result_df[
+         [
+             "Metric",
+             "Mean Score",
+             "Median Score",
+             "Max Score",
+             "Min Score",
+             "Standard Deviation",
+             "Count",
+         ]
+     ]
+
+     return (result_df, *tuple(figures))
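
Note: the rewritten ToxicityScore test above scores input, reference, and generated texts separately with the Hugging Face `evaluate` toxicity measurement. Below is a minimal standalone sketch of that scoring step, using the same `evaluate` calls shown in the diff; loading the metric downloads its underlying classifier, and the example strings are hypothetical and not part of the package.

    import evaluate
    import pandas as pd

    texts = ["have a great day", "this draft is useless and so are you"]

    # Score each text individually, mirroring compute_toxicity_scores above
    toxicity = evaluate.load("toxicity")
    scores = [toxicity.compute(predictions=[t])["toxicity"][0] for t in texts]

    df = pd.DataFrame({"text": texts, "toxicity": scores})
    print(df["toxicity"].describe().loc[["mean", "50%", "max"]])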