validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (110)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +3 -3
  3. validmind/api_client.py +2 -3
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/html_templates/__init__.py +0 -0
  13. validmind/html_templates/content_blocks.py +89 -14
  14. validmind/models/__init__.py +7 -4
  15. validmind/models/foundation.py +8 -34
  16. validmind/models/function.py +51 -0
  17. validmind/models/huggingface.py +16 -46
  18. validmind/models/metadata.py +42 -0
  19. validmind/models/pipeline.py +66 -0
  20. validmind/models/pytorch.py +8 -42
  21. validmind/models/r_model.py +33 -82
  22. validmind/models/sklearn.py +39 -38
  23. validmind/template.py +8 -26
  24. validmind/tests/__init__.py +43 -20
  25. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  26. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  27. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  28. validmind/tests/data_validation/Duplicates.py +1 -1
  29. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  30. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  31. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  32. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  34. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  35. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  36. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  37. validmind/tests/decorator.py +2 -2
  38. validmind/tests/model_validation/BertScore.py +100 -98
  39. validmind/tests/model_validation/BleuScore.py +93 -64
  40. validmind/tests/model_validation/ContextualRecall.py +74 -91
  41. validmind/tests/model_validation/MeteorScore.py +86 -74
  42. validmind/tests/model_validation/RegardScore.py +103 -121
  43. validmind/tests/model_validation/RougeScore.py +118 -0
  44. validmind/tests/model_validation/TokenDisparity.py +84 -121
  45. validmind/tests/model_validation/ToxicityScore.py +109 -123
  46. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  47. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  48. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  50. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  51. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  52. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  56. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  57. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  58. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  59. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  60. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  61. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  62. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  63. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  65. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  66. validmind/tests/model_validation/ragas/utils.py +66 -0
  67. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  68. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  69. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  70. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  71. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  72. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  73. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  74. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
  75. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  76. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  83. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  84. validmind/unit_metrics/__init__.py +26 -49
  85. validmind/unit_metrics/composite.py +5 -1
  86. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  87. validmind/utils.py +56 -6
  88. validmind/vm_models/__init__.py +1 -1
  89. validmind/vm_models/dataset/__init__.py +7 -0
  90. validmind/vm_models/dataset/dataset.py +558 -0
  91. validmind/vm_models/dataset/utils.py +146 -0
  92. validmind/vm_models/model.py +97 -72
  93. validmind/vm_models/test/result_wrapper.py +61 -24
  94. validmind/vm_models/test_context.py +1 -1
  95. validmind/vm_models/test_suite/summary.py +3 -4
  96. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
  97. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
  98. validmind/models/catboost.py +0 -33
  99. validmind/models/statsmodels.py +0 -50
  100. validmind/models/xgboost.py +0 -30
  101. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  102. validmind/tests/model_validation/RegardHistogram.py +0 -148
  103. validmind/tests/model_validation/RougeMetrics.py +0 -147
  104. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  105. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  106. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  107. validmind/vm_models/dataset.py +0 -1303
  108. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
  109. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
  110. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/RegardHistogram.py
@@ -1,148 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import itertools
- from dataclasses import dataclass
-
- import evaluate
- import plotly.graph_objects as go
- from plotly.subplots import make_subplots
-
- from validmind.vm_models import Figure, Metric
-
-
- @dataclass
- class RegardHistogram(Metric):
-     """
-     **Purpose:**
-     The `RegardHistogram` metric offers a histogram-based visualization of regard scores across different text
-     samples. As an evolution from the line plot representation, the histogram provides a distributional perspective
-     on how often certain regard scores (positive, negative, neutral, or other) are perceived or generated by the model.
-
-     **Test Mechanism:**
-     This metric extracts the necessary data from the model's test dataset: the input text, the true regard (target text),
-     and the model's predicted regard. After ensuring data consistency, the `evaluate.load("regard")` tool computes the
-     regard scores. Histograms are then created for each category of regard against input, target, and predicted texts.
-     These histograms illustrate the frequency distribution of scores within each category, shedding light on commonalities
-     or outliers in the model's performance.
-
-     **Signs of High Risk:**
-     Any noticeable skewness in the histogram, especially when comparing the predicted regard scores with the target regard
-     scores, could indicate biases or inconsistencies in the model. For instance, a lack of neutral scores in the model's
-     predictions, despite a balanced distribution in the target data, might signal an issue.
-
-     **Strengths:**
-     Histogram representations give a quick, intuitive snapshot of score distributions. The immediate visual contrast
-     between the model's predictions and target data is useful for stakeholders and researchers aiming to gauge
-     model reliability in regard to sentiment assessments.
-
-     **Limitations:**
-     The efficacy of `RegardHistogram` hinges upon the precision of underlying tools like `evaluate.load("regard")`, which
-     might have their biases or inaccuracies. The metric portrays regard in set categories, but sentiments in real-world scenarios
-     can be more nuanced. Assumptions made, like the expectation of consistent regard across texts, may not always hold, given that
-     sentiments can be subjective. While histograms showcase distributions, they may not capture the intricate contexts behind
-     texts, possibly leading to oversimplifications or misinterpretations.
-     """
-
-     name = "regard_histogram"
-     required_inputs = ["model", "dataset"]
-     metadata = {
-         "task_types": ["text_classification", "text_summarization"],
-         "tags": ["regard_histogram"],
-     }
-
-     def _get_datasets(self):
-         if not hasattr(self, "model"):
-             raise AttributeError("The 'model' attribute is missing.")
-
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-         if not len(y_true) == len(y_pred):
-             raise ValueError(
-                 "Inconsistent lengths among true summaries and predicted summaries."
-             )
-
-         return y_true, y_pred
-
-     def regard_histogram(self):
-         regard_tool = evaluate.load("regard")
-         y_true, y_pred = self._get_datasets()
-
-         dataframes = {
-             "Target Text": y_true,
-             "Predicted Summaries": y_pred,
-         }
-
-         total_text_columns = len(dataframes)
-         total_rows = total_text_columns * 2
-
-         categories_order = ["positive", "negative", "neutral", "other"]
-         category_colors = {
-             "negative": "#d9534f",
-             "neutral": "#5bc0de",
-             "other": "#f0ad4e",
-             "positive": "#5cb85c",
-         }
-
-         fig = make_subplots(
-             rows=total_rows,
-             cols=2,
-             subplot_titles=[
-                 f"{col_name} {cat}"
-                 for col_name in dataframes
-                 for cat in categories_order
-             ],
-             shared_xaxes=True,
-             vertical_spacing=0.1,
-         )
-
-         row_offset = 0
-
-         for column_name, column_data in dataframes.items():
-             results = regard_tool.compute(data=column_data)["regard"]
-             regard_dicts = [
-                 dict((x["label"], x["score"]) for x in sublist) for sublist in results
-             ]
-
-             for idx, category in enumerate(categories_order, start=1):
-                 row, col = ((idx - 1) // 2 + 1 + row_offset, (idx - 1) % 2 + 1)
-                 fig.add_trace(
-                     go.Histogram(
-                         name=f"{category} ({column_name})",
-                         x=[res_dict[category] for res_dict in regard_dicts],
-                         marker_color=category_colors[category],
-                         showlegend=False,  # Disable the legend
-                     ),
-                     row=row,
-                     col=col,
-                 )
-             row_offset += 2  # Move to the next pair of rows for the next text column
-
-         subplot_height = 350
-         total_height = (
-             total_rows * subplot_height + 200
-         )  # 200 for padding, titles, etc.
-
-         fig.update_layout(
-             title_text="Regard Score Histogram Distribution", height=total_height
-         )
-
-         # Specify x and y titles only for the first subplot
-         fig.update_xaxes(title_text="Index", showticklabels=True, row=1, col=1)
-         fig.update_yaxes(title_text="Score", showticklabels=True, row=1, col=1)
-
-         # Show tick labels on all subplots
-         for row in range(total_rows):
-             for col in range(2):  # since you have 2 columns
-                 fig.update_xaxes(showticklabels=True, row=row + 1, col=col + 1)
-                 fig.update_yaxes(showticklabels=True, row=row + 1, col=col + 1)
-
-         return fig
-
-     def run(self):
-         fig = self.regard_histogram()
-         return self.cache_results(
-             figures=[Figure(for_object=self, key=self.key, figure=fig)]
-         )
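Note: the scoring call that the removed RegardHistogram test wrapped is the Hugging Face `evaluate` "regard" metric, used exactly as in the code above. A minimal standalone sketch of that call follows; the sample texts are invented for illustration only.

import evaluate

# Same scorer the removed test loaded; for each input text it returns a list
# of {label, score} dicts covering positive/negative/neutral/other regard.
regard_tool = evaluate.load("regard")

texts = ["The staff were friendly and helpful.", "The service was slow and rude."]  # made-up examples
results = regard_tool.compute(data=texts)["regard"]

# Flatten each per-text list into a {label: score} mapping, as the removed test did
regard_dicts = [dict((x["label"], x["score"]) for x in sublist) for sublist in results]
print(regard_dicts)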
validmind/tests/model_validation/RougeMetrics.py
@@ -1,147 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import itertools
- from dataclasses import dataclass
-
- import pandas as pd
- import plotly.graph_objects as go
- from rouge import Rouge
-
- from validmind.vm_models import Figure, Metric
-
-
- @dataclass
- class RougeMetrics(Metric):
-     """
-     Evaluates the quality of machine-generated text using various ROUGE metrics, and visualizes the results.
-
-     **Purpose**: The ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a metric employed to assess the
-     quality of machine-generated text. This evaluation technique is mainly used in natural language processing tasks,
-     such as text summarization, machine translation, and text generation. Its goal is to measure how well the
-     machine-generated text reflects the key information and concepts in the human-crafted reference text.
-
-     **Test Mechanism**:
-
-     1. **Comparison Procedure**: The testing mechanism involves comparing machine-generated content with a reference
-     human-constructed text.
-
-     2. **Integral Metrics**:
-     - **ROUGE-N (N-gram Overlap)**: This evaluates the overlap of n-grams (sequences of n words) between the
-     generated and reference texts. The common n-values are 1 (unigrams), 2 (bigrams), and 3 (trigrams). Each metric
-     calculates precision, recall, and F1-score.
-
-     - **ROUGE-L (Longest Common Subsequence)**: This identifies the longest shared word sequence in both the machine
-     and reference texts, thus evaluating the capability of the generated text to mirror key phrases.
-
-     - **ROUGE-S (Skip-bigram)**: This measures the concurrence of skip-bigrams — word pairings that appear within a
-     predefined word window in the text. This metric maintains sensitivity to word order while allowing for sporadic
-     word omissions.
-
-     3. **Visual Representation**: Precision, recall, and F1-score for all the metrics are visually charted, which makes
-     the results easier to comprehend.
-
-     **Signs of High Risk**:
-
-     - Low scores across the suite of ROUGE metrics
-     - Low precision might indicate redundant information in machine-produced text
-     - Low recall may suggest the omission of important information from the reference text
-     - Low F1 score could indicate an imbalanced performance between precision and recall
-     - Persistent low scores could signal inherent flaws in the model
-
-     **Strengths**:
-
-     - Offers a multifaceted perspective on text quality using various evaluation metrics
-     - Adapts to synonyms and rewording, thanks to n-gram-based evaluation
-     - Encourages the retention of key word sequences using the longest common subsequence method
-     - Visual representation of precision, recall, and F1-scores enhances understandability of model performance
-
-     **Limitations**:
-
-     - May fail to fully address the semantic coherence, fluency, or grammatical quality of the generated text
-     - Tends to evaluate isolated phrases or n-grams rather than comprehensive sentences
-     - May prove challenging when reference texts are difficult or impractical to obtain due to its reliance on
-     comparisons with human-made references.
-     """
-
-     name = "rouge_metric"
-     required_inputs = ["model", "dataset"]
-     default_params = {
-         "rouge_metrics": ["rouge-1", "rouge-2", "rouge-l"],
-     }
-
-     def run(self):
-         r_metrics = self.params["rouge_metrics"]
-         if r_metrics is None:
-             raise ValueError("rouge_metrics must be provided in params")
-
-         if not (
-             set(self.default_params.get("rouge_metrics")).intersection(r_metrics)
-             == set(r_metrics)
-         ):
-             raise ValueError(
-                 f"Invalid metrics from {self.default_params.get('rouge_metrics')}"
-             )
-
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-         rouge = Rouge(metrics=r_metrics)
-
-         score_list = []
-         for y_t, y_p in zip(y_true, y_pred):
-             scores = rouge.get_scores(y_p, y_t, avg=True)
-             score_list.append(scores)
-
-         metrics_df = pd.DataFrame(score_list)
-         figures = []
-
-         for m in metrics_df.columns:
-             df_scores = pd.DataFrame(metrics_df[m].tolist())
-             # Visualization part
-             fig = go.Figure()
-
-             # Adding the line plots for precision, recall, and F1-score with lines and markers
-             fig.add_trace(
-                 go.Scatter(
-                     x=df_scores.index,
-                     y=df_scores["p"],
-                     mode="lines+markers",
-                     name="Precision",
-                 )
-             )
-             fig.add_trace(
-                 go.Scatter(
-                     x=df_scores.index,
-                     y=df_scores["r"],
-                     mode="lines+markers",
-                     name="Recall",
-                 )
-             )
-             fig.add_trace(
-                 go.Scatter(
-                     x=df_scores.index,
-                     y=df_scores["f"],
-                     mode="lines+markers",
-                     name="F1 Score",
-                 )
-             )
-
-             fig.update_layout(
-                 title=f"ROUGE Scores for {m}",
-                 xaxis_title="Row Index",
-                 yaxis_title="Score",
-             )
-
-             # Ensure a unique key for each metric
-             k = f"{m.replace('-', '')}_{len(figures)}"
-             figures.append(
-                 Figure(
-                     for_object=self,
-                     key=k,
-                     figure=fig,
-                 )
-             )
-
-         return self.cache_results(figures=figures)
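Note: the per-row scoring step in the removed RougeMetrics test is the `rouge` package's Rouge.get_scores call shown above. A minimal sketch under the same default metrics; the candidate and reference strings are made up for illustration.

from rouge import Rouge

rouge = Rouge(metrics=["rouge-1", "rouge-2", "rouge-l"])  # same defaults as the removed test
candidate = "the cat is on the mat"   # hypothetical model output
reference = "the cat sat on the mat"  # hypothetical reference text

# Returns {"rouge-1": {"r": ..., "p": ..., "f": ...}, "rouge-2": {...}, "rouge-l": {...}}
scores = rouge.get_scores(candidate, reference, avg=True)
print(scores["rouge-1"]["f"])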
validmind/tests/model_validation/RougeMetricsAggregate.py
@@ -1,133 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import itertools
- from dataclasses import dataclass
-
- import pandas as pd
- import plotly.graph_objects as go
- from rouge import Rouge
-
- from validmind.vm_models import Figure, Metric
-
-
- @dataclass
- class RougeMetricsAggregate(Metric):
-     """
-     Evaluates the average quality of machine-generated text using various ROUGE metrics and visualizes the aggregated results.
-
-     **Purpose**: The ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, remains a cornerstone for assessing
-     machine-generated text quality. Predominantly used in tasks such as text summarization, machine translation,
-     and text generation, the emphasis of ROUGE is to gauge the reflection of pivotal information and core concepts
-     from human references in machine-produced content.
-
-     **Test Mechanism**:
-
-     1. **Comparison Procedure**: The evaluation requires contrasting machine-rendered text against a human-made reference.
-
-     2. **Integral Metrics**:
-     - **ROUGE-N (N-gram Overlap)**: Assesses the commonality of n-grams between both sets of texts. Regularly,
-     metrics consider 1 (unigrams), 2 (bigrams), and 3 (trigrams), rendering precision, recall, and F1-score.
-
-     - **ROUGE-L (Longest Common Subsequence)**: Discerns the lengthiest mutually inclusive word chain in both
-     texts, ascertaining the machine text's efficacy in capturing essential phrases.
-
-     - **ROUGE-S (Skip-bigram)**: Quantifies the concurrence of skip-bigrams. This metric cherishes word order
-     but tolerates occasional omissions.
-
-     3. **Visual Representation**: The aggregate approach underscores the visualization of average scores across
-     precision, recall, and F1-score, enhancing result interpretation.
-
-     **Signs of High Risk**:
-
-     - Diminished average scores across ROUGE metrics
-     - Depressed precision may highlight verbosity in machine text
-     - Lacking recall might hint at missed critical details from the reference
-     - A dwindling F1 score might spotlight a disjointed precision-recall performance
-     - Consistently low averages could reveal deep-rooted model inadequacies
-
-     **Strengths**:
-
-     - Provides a holistic view of text quality via diverse metrics
-     - Gracefully handles paraphrasing owing to n-gram evaluations
-     - Promotes the capture of crucial word chains through the longest common subsequence
-     - Aggregate visual insights bolster comprehension of overall model behavior
-
-     **Limitations**:
-
-     - Might overlook nuances like semantic integrity, fluency, or syntactic correctness
-     - Focuses more on discrete phrases or n-grams over holistic sentences
-     - Reliance on human references can be limiting when they're hard to source or infeasible.
-     """
-
-     name = "rouge_metrics_aggregate"
-     required_inputs = ["model", "dataset"]
-     default_params = {
-         "rouge_metrics": ["rouge-1", "rouge-2", "rouge-l"],
-     }
-
-     def run(self):
-         r_metrics = self.params["rouge_metrics"]
-         if r_metrics is None:
-             raise ValueError("rouge_metrics must be provided in params")
-
-         if not (
-             set(self.default_params.get("rouge_metrics")).intersection(r_metrics)
-             == set(r_metrics)
-         ):
-             raise ValueError(
-                 f"Invalid metrics from {self.default_params.get('rouge_metrics')}"
-             )
-
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-         rouge = Rouge(metrics=r_metrics)
-
-         score_list = []
-         for y_t, y_p in zip(y_true, y_pred):
-             scores = rouge.get_scores(y_p, y_t, avg=True)
-             score_list.append(scores)
-
-         metrics_df = pd.DataFrame(score_list)
-         figures = []
-
-         colors = {"Precision": "blue", "Recall": "green", "F1 Score": "red"}
-         mapping = {"p": "Precision", "r": "Recall", "f": "F1 Score"}
-
-         for m in metrics_df.columns:
-             df_scores = pd.DataFrame(metrics_df[m].tolist())
-             avg_scores = df_scores.mean()
-
-             # Visualization part
-             fig = go.Figure()
-
-             # Adding the bar plots for average scores with specified colors
-             for metric_short, metric_full in mapping.items():
-                 fig.add_trace(
-                     go.Bar(
-                         x=[metric_full],
-                         y=[avg_scores[metric_short]],
-                         name=metric_full,
-                         marker_color=colors[metric_full],
-                         showlegend=False,
-                     )
-                 )
-
-             fig.update_layout(
-                 title=f"Average ROUGE Scores for {m}",
-                 xaxis_title="Metric",
-                 yaxis_title="Average Score",
-             )
-
-             k = m.replace("-", "")
-             figures.append(
-                 Figure(
-                     for_object=self,
-                     key=k,
-                     figure=fig,
-                 )
-             )
-
-         return self.cache_results(figures=figures)
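Note: RougeMetricsAggregate differed from RougeMetrics only in its aggregation step, averaging precision, recall, and F1 across rows before plotting. A minimal sketch of that averaging; the candidate/reference pairs are made up for illustration.

import pandas as pd
from rouge import Rouge

rouge = Rouge(metrics=["rouge-1", "rouge-l"])
pairs = [  # (hypothetical model output, hypothetical reference)
    ("the cat is on the mat", "the cat sat on the mat"),
    ("a quick brown fox", "the quick brown fox jumps over the lazy dog"),
]
score_list = [rouge.get_scores(cand, ref, avg=True) for cand, ref in pairs]
metrics_df = pd.DataFrame(score_list)

for m in metrics_df.columns:
    # Mean precision / recall / F1 per ROUGE variant, as the removed test plotted
    avg_scores = pd.DataFrame(metrics_df[m].tolist()).mean()
    print(m, avg_scores.to_dict())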
validmind/tests/model_validation/SelfCheckNLIScore.py
@@ -1,112 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import itertools
- from dataclasses import dataclass
-
- import pandas as pd
- import plotly.graph_objects as go
- import torch
- from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
- from tqdm import tqdm
-
- from validmind.vm_models import Figure, Metric
-
-
- @dataclass
- class SelfCheckNLIScore(Metric):
-     """
-     Evaluates text generation models' performance by quantifying the level of hallucination in generated texts compared to reference texts.
-
-     **Purpose**: The HallucinationScore metric is designed to assess the factual accuracy and reliability of text generated by models, focusing on the detection and quantification of hallucinations—instances where generated content deviates from factual or expected outputs. By comparing generated texts against reference texts, this metric highlights discrepancies indicative of hallucinations, offering insights into the model's ability to produce contextually and factually coherent content.
-
-     **Test Mechanism**: To compute the HallucinationScore, the metric employs a comparison between the generated texts (model predictions) and the provided reference texts (true values). Using the SelfCheckNLI model, it evaluates each generated text's level of factual congruence with the reference, assigning a hallucination score based on the semantic coherence and factual accuracy. The scores for each text instance are then visualized in a line plot, allowing for the examination of hallucination trends across the dataset.
-
-     **Signs of High Risk**:
-     - High hallucination scores across a significant portion of the dataset, indicating a prevalence of factually inaccurate or irrelevant content generation.
-     - Patterns of consistent hallucination in specific contexts or subjects, suggesting gaps in the model's understanding or knowledge.
-     - Sharp fluctuations in hallucination scores, which may reveal inconsistencies in the model's performance or sensitivity to certain types of input.
-
-     **Strengths**:
-     - Directly addresses the critical aspect of factual accuracy in generated text, beyond mere linguistic or stylistic coherence.
-     - Provides a granular, instance-by-instance analysis of model performance, allowing for targeted improvements and diagnostics.
-     - Facilitates a deeper understanding of a model's capabilities and limitations in producing reliable and accurate content.
-
-     **Limitations**:
-     - Reliance on the SelfCheckNLI model means the accuracy and effectiveness of the HallucinationScore are contingent upon the performance and suitability of the underlying NLI model.
-     - May not fully capture the subtleties of certain factual inaccuracies or the contextual relevance of reference texts, especially in complex or nuanced domains.
-     - Potentially resource-intensive, given the computational demands of running advanced NLI models for large datasets.
-     """
-
-     name = "self_check_nli_score"
-     required_inputs = ["model", "dataset"]
-
-     def run(self):
-         # Assuming the dataset is structured with generated sentences and reference samples
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-         hallucination_scores = self.compute_hallucination_scores(y_pred, y_true)
-
-         # Visualization of scores
-         figures = self.visualize_scores(hallucination_scores)
-
-         return self.cache_results(figures=figures)
-
-     def compute_hallucination_scores(self, predictions, references):
-         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         selfcheck_nli = SelfCheckNLI(device=device)
-         hallucination_scores = []
-
-         print("Starting hallucination score computation...")
-
-         for index, (sentences, samples) in enumerate(
-             tqdm(zip(predictions, references), total=len(predictions))
-         ):
-             sent_scores_nli = selfcheck_nli.predict(
-                 sentences=sentences, sampled_passages=samples
-             )
-
-             # Compute the mean of the hallucination scores for this row
-             average_score = sent_scores_nli.mean()
-             hallucination_scores.append(average_score)
-
-             # Print a progress update for each row
-             print(
-                 f"Row {index + 1}/{len(predictions)}: Average hallucination score: {average_score}"
-             )
-
-         print("Completed hallucination score computation.")
-
-         return hallucination_scores
-
-     def visualize_scores(self, scores):
-         scores_df = pd.DataFrame(scores, columns=["Hallucination Score"])
-
-         fig = go.Figure()
-         fig.add_trace(
-             go.Scatter(
-                 x=scores_df.index,
-                 y=scores_df["Hallucination Score"],
-                 mode="lines+markers",
-                 name="Hallucination Score",
-             )
-         )
-
-         fig.update_layout(
-             title="Hallucination Scores Across Text Instances",
-             xaxis_title="Text Instance Index",
-             yaxis_title="Hallucination Score",
-         )
-
-         # Wrapping the plotly figure for compatibility with your framework might be needed
-         figures = [
-             Figure(
-                 for_object=self,
-                 key=self.key,
-                 figure=fig,
-             )
-         ]
-
-         return figures
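Note: the hallucination scoring in the removed SelfCheckNLIScore test comes from the selfcheckgpt package's SelfCheckNLI model, called as in the code above. A minimal sketch follows; the sentences and sampled passages are hypothetical placeholders, whereas in the removed test they came from the dataset's prediction and reference columns.

import torch
from selfcheckgpt.modeling_selfcheck import SelfCheckNLI

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
selfcheck_nli = SelfCheckNLI(device=device)

sentences = ["Paris is the capital of France."]  # hypothetical generated sentences
sampled_passages = [                             # hypothetical reference/sampled passages
    "Paris is the capital and most populous city of France.",
    "France's capital city is Paris.",
]

# One score per sentence; the removed test averaged them per row and treated
# higher values as stronger evidence of hallucination.
sent_scores_nli = selfcheck_nli.predict(sentences=sentences, sampled_passages=sampled_passages)
print(sent_scores_nli.mean())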
validmind/tests/model_validation/ToxicityHistogram.py
@@ -1,136 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import itertools
- from dataclasses import dataclass
-
- import evaluate
- import pandas as pd
- import plotly.graph_objects as go
- import plotly.subplots as sp
-
- from validmind.vm_models import Figure, Metric
-
-
- @dataclass
- class ToxicityHistogram(Metric):
-     """
-     **Purpose:**
-     The ToxicityHistogram metric visualizes and analyzes the toxicity scores of various texts. Through histograms, it
-     provides insights into the distribution and nature of toxicity present in the evaluated text segments.
-
-     **Test Mechanism:**
-     Texts are fetched from specified columns and their toxicity scores are computed using a preloaded `toxicity`
-     evaluation tool. Each text data column is visualized with its own histogram, culminating in a multi-panel
-     visualization.
-
-     **Signs of High Risk:**
-     High toxicity concentrations in the histogram, especially on the upper scale, signify a higher presence of toxic
-     content in the respective text segment. If predicted summaries show significantly differing patterns from input or
-     target texts, it could indicate issues with the model's output.
-
-     **Strengths:**
-     The metric offers a lucid representation of toxicity distributions, facilitating the swift identification of
-     concerning patterns. It's instrumental for gauging potential pitfalls of generated content, particularly in the
-     realm of predicted summaries.
-
-     **Limitations:**
-     The ToxicityHistogram's efficacy hinges on the accuracy of the `toxicity` tool it employs. While histograms depict
-     distribution patterns, they omit details about which specific text portions or tokens result in high toxicity
-     scores. Therefore, for a comprehensive understanding, more in-depth analysis might be requisite.
-     """
-
-     name = "toxicity_histogram"
-     required_inputs = ["model"]
-     metadata = {
-         "task_types": [
-             "text_classification",
-             "text_summarization",
-         ],
-         "tags": ["toxicity_histogram"],
-     }
-
-     def _get_datasets(self):
-         # Check model attributes
-         if not hasattr(self, "model"):
-             raise AttributeError("The 'model' attribute is missing.")
-
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-         input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
-
-         # Ensure consistency in lengths
-         if not len(y_true) == len(y_pred) == len(input_text):
-             raise ValueError(
-                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
-             )
-
-         return input_text, y_true, y_pred
-
-     def toxicity_histograms(self, df):
-         """
-         Compute toxicity scores for texts and then plot histograms for all columns of df.
-
-         Parameters:
-         - df (pd.DataFrame): The dataframe containing texts.
-         """
-
-         # Extract necessary parameters
-         toxicity = evaluate.load("toxicity")
-
-         # Get all columns of df
-         text_columns = df.columns.tolist()
-
-         # Determine the number of rows required based on the number of text columns
-         num_rows = (len(text_columns) + 1) // 2  # +1 to handle odd number of columns
-
-         # Create a subplot layout
-         fig = sp.make_subplots(rows=num_rows, cols=2, subplot_titles=text_columns)
-
-         subplot_height = 350  # Height of each subplot
-         total_height = num_rows * subplot_height + 200  # 200 for padding, titles, etc.
-
-         for idx, col in enumerate(text_columns, start=1):
-             row = (idx - 1) // 2 + 1
-             col_idx = (idx - 1) % 2 + 1  # to place subplots in two columns
-
-             # Get list of texts from dataframe
-             texts = df[col].tolist()
-
-             # Compute toxicity for texts
-             toxicity_scores = toxicity.compute(predictions=texts)["toxicity"]
-
-             # Add traces to the corresponding subplot without legend
-             fig.add_trace(
-                 go.Histogram(x=toxicity_scores, showlegend=False), row=row, col=col_idx
-             )
-
-             # Update xaxes and yaxes titles only for the first subplot
-             if idx == 1:
-                 fig.update_xaxes(title_text="Toxicity Score", row=row, col=col_idx)
-                 fig.update_yaxes(title_text="Frequency", row=row, col=col_idx)
-
-         # Update layout
-         fig.update_layout(
-             title_text="Histograms of Toxicity Scores", height=total_height
-         )
-
-         return fig
-
-     def run(self):
-         input_text, y_true, y_pred = self._get_datasets()
-
-         df = pd.DataFrame(
-             {
-                 "Input Text": input_text,
-                 "Target Text": y_true,
-                 "Predicted Summaries": y_pred,
-             }
-         )
-
-         fig = self.toxicity_histograms(df)
-
-         return self.cache_results(
-             figures=[Figure(for_object=self, key=self.key, figure=fig)]
-         )
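Note: the toxicity scores in the removed ToxicityHistogram test come from the Hugging Face `evaluate` "toxicity" metric, loaded exactly as above. A minimal standalone sketch of that scoring call; the example texts are made up for illustration.

import evaluate

toxicity = evaluate.load("toxicity")  # same scorer the removed test used
texts = ["Thank you for the thoughtful review.", "This answer is complete garbage."]  # made-up examples

# One toxicity score per input text, roughly in the [0, 1] range
toxicity_scores = toxicity.compute(predictions=texts)["toxicity"]
print(toxicity_scores)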