validmind 2.1.1__py3-none-any.whl → 2.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +72 -49
  3. validmind/api_client.py +42 -16
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/errors.py +1 -1
  13. validmind/html_templates/__init__.py +0 -0
  14. validmind/html_templates/content_blocks.py +89 -14
  15. validmind/models/__init__.py +7 -4
  16. validmind/models/foundation.py +8 -34
  17. validmind/models/function.py +51 -0
  18. validmind/models/huggingface.py +16 -46
  19. validmind/models/metadata.py +42 -0
  20. validmind/models/pipeline.py +66 -0
  21. validmind/models/pytorch.py +8 -42
  22. validmind/models/r_model.py +33 -82
  23. validmind/models/sklearn.py +39 -38
  24. validmind/template.py +8 -26
  25. validmind/tests/__init__.py +43 -20
  26. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  27. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  28. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  29. validmind/tests/data_validation/Duplicates.py +1 -1
  30. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  31. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  32. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  33. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  34. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  35. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  36. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  37. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  38. validmind/tests/decorator.py +12 -7
  39. validmind/tests/model_validation/BertScore.py +100 -98
  40. validmind/tests/model_validation/BleuScore.py +93 -64
  41. validmind/tests/model_validation/ContextualRecall.py +74 -91
  42. validmind/tests/model_validation/MeteorScore.py +86 -74
  43. validmind/tests/model_validation/RegardScore.py +103 -121
  44. validmind/tests/model_validation/RougeScore.py +118 -0
  45. validmind/tests/model_validation/TokenDisparity.py +84 -121
  46. validmind/tests/model_validation/ToxicityScore.py +109 -123
  47. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  48. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  50. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  51. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  52. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  56. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  57. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  58. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  59. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  60. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  61. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  62. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  63. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  65. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  66. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  67. validmind/tests/model_validation/ragas/utils.py +66 -0
  68. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  69. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  70. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  71. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  72. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  73. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  74. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  75. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
  76. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  83. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  84. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  85. validmind/unit_metrics/__init__.py +26 -49
  86. validmind/unit_metrics/composite.py +13 -7
  87. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  88. validmind/utils.py +99 -6
  89. validmind/vm_models/__init__.py +1 -1
  90. validmind/vm_models/dataset/__init__.py +7 -0
  91. validmind/vm_models/dataset/dataset.py +560 -0
  92. validmind/vm_models/dataset/utils.py +146 -0
  93. validmind/vm_models/model.py +97 -72
  94. validmind/vm_models/test/metric.py +9 -24
  95. validmind/vm_models/test/result_wrapper.py +124 -28
  96. validmind/vm_models/test/threshold_test.py +10 -28
  97. validmind/vm_models/test_context.py +1 -1
  98. validmind/vm_models/test_suite/summary.py +3 -4
  99. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/METADATA +5 -3
  100. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/RECORD +103 -78
  101. validmind/models/catboost.py +0 -33
  102. validmind/models/statsmodels.py +0 -50
  103. validmind/models/xgboost.py +0 -30
  104. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  105. validmind/tests/model_validation/RegardHistogram.py +0 -148
  106. validmind/tests/model_validation/RougeMetrics.py +0 -147
  107. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  108. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  109. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  110. validmind/vm_models/dataset.py +0 -1303
  111. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/LICENSE +0 -0
  112. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/WHEEL +0 -0
  113. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/MeteorScore.py
@@ -2,91 +2,103 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import evaluate
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
 
 
-@dataclass
-class MeteorScore(Metric):
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def MeteorScore(dataset, model):
     """
     Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.
 
-    **Purpose**: METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality
-    of machine translations by comparing them against reference translations. It emphasizes both the accuracy and fluency
-    of translations, incorporating precision, recall, and word order into its assessment.
-
-    **Test Mechanism**: The METEOR score is computed for each pair of machine-generated translation (prediction) and its
-    corresponding human-produced reference. This is done by considering unigram matches between the translations, including
-    matches based on surface forms, stemmed forms, and synonyms. The score is a combination of unigram precision and recall,
-    adjusted for word order through a fragmentation penalty.
-
-    **Signs of High Risk**:
-    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references, highlighting potential deficiencies in both the accuracy and fluency of translations.
-    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes and reconstructs sentence structures, potentially compromising the natural flow of translated text.
-    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
-
-    **Strengths**:
-    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of
-    content coverage in translations.
+    **Purpose:**
+    METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality of machine translations
+    by comparing them against reference translations. It emphasizes both the accuracy and fluency of translations, incorporating
+    precision, recall, and word order into its assessment.
+
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. The METEOR score is computed
+    for each pair of machine-generated translation (prediction) and its corresponding human-produced reference. This is done by
+    considering unigram matches between the translations, including matches based on surface forms, stemmed forms, and synonyms.
+    The score is a combination of unigram precision and recall, adjusted for word order through a fragmentation penalty. Scores are
+    compiled into a dataframe, and histograms and bar charts are generated to visualize the distribution of METEOR scores. Additionally,
+    a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the METEOR scores,
+    providing a comprehensive summary of the model's performance.
+
+    **Signs of High Risk:**
+    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references,
+    highlighting potential deficiencies in both the accuracy and fluency of translations.
+    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes
+    and reconstructs sentence structures, potentially compromising the natural flow of translated text.
+    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the
+    nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
+
+    **Strengths:**
+    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of content coverage in translations.
     - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
     - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.
 
-    **Limitations**:
-    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
-    large datasets.
-    - The use of external resources for synonym and stemming matching may introduce variability based on the resources'
-    quality and relevance to the specific translation task.
+    **Limitations:**
+    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for large datasets.
+    - The use of external resources for synonym and stemming matching may introduce variability based on the resources' quality and relevance to the specific
+    translation task.
     """
 
-    name = "meteor_score"
-    required_inputs = ["model", "dataset"]
-
-    def run(self):
-        # Load the METEOR metric
-        meteor = evaluate.load("meteor")
-
-        # Initialize a list to hold METEOR scores
-        meteor_scores = []
-
-        for prediction, reference in zip(
-            self.inputs.dataset.y_pred(self.inputs.model),
-            self.inputs.dataset.y,
-        ):
-            # Compute the METEOR score for the current prediction-reference pair
-            result = meteor.compute(predictions=[prediction], references=[reference])
-            meteor_scores.append(result["meteor"])
-
-        # Visualization of METEOR scores
-        figures = self.visualize_scores(meteor_scores)
-
-        return self.cache_results(figures=figures)
-
-    def visualize_scores(self, scores):
-        # Convert the scores list to a DataFrame for plotting
-        scores_df = pd.DataFrame(scores, columns=["METEOR Score"])
-
-        # Create a line plot of the METEOR scores
-        fig = go.Figure()
-        fig.add_trace(
-            go.Scatter(
-                x=scores_df.index,
-                y=scores_df["METEOR Score"],
-                mode="lines+markers",
-                name="METEOR Score",
-            )
-        )
-        fig.update_layout(
-            title="METEOR Scores Across Text Instances",
-            xaxis_title="Text Instance Index",
-            yaxis_title="METEOR Score",
-        )
-
-        # Wrap the Plotly figure for compatibility with your framework
-        figures = [Figure(for_object=self, key=self.key, figure=fig)]
-
-        return figures
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Load the METEOR evaluation metric
+    meteor = evaluate.load("meteor")
+
+    # Calculate METEOR scores
+    score_list = []
+    for y_t, y_p in zip(y_true, y_pred):
+        # Compute the METEOR score
+        score = meteor.compute(predictions=[y_p], references=[y_t])
+        score_list.append(score["meteor"])
+
+    # Convert scores to a dataframe
+    metrics_df = pd.DataFrame(score_list, columns=["METEOR Score"])
+
+    figures = []
+
+    # Histogram for METEOR Score
+    hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["METEOR Score"])])
+    hist_fig.update_layout(
+        title="METEOR Score Histogram",
+        xaxis_title="METEOR Score",
+        yaxis_title="Count",
+    )
+    figures.append(hist_fig)
+
+    # Bar Chart for METEOR Score
+    bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df["METEOR Score"])])
+    bar_fig.update_layout(
+        title="METEOR Score Bar Chart",
+        xaxis_title="Row Index",
+        yaxis_title="METEOR Score",
+    )
+    figures.append(bar_fig)
+
+    # Calculate statistics for METEOR Score
+    stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Score",
+            "50%": "Median Score",
+            "max": "Max Score",
+            "min": "Min Score",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(metrics_df)
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
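Note: the change above replaces the class-based `Metric` subclass with a plain function decorated with `@tags`/`@tasks` that returns its summary table and figures directly instead of calling `cache_results`. For readers unfamiliar with the underlying scorer, here is a minimal standalone sketch (not part of the diff) of the Hugging Face `evaluate` METEOR call the rewritten test relies on; the prediction/reference strings are made-up examples, not data from the package.

    import evaluate

    # Load the METEOR scorer from Hugging Face `evaluate` (may fetch NLTK data on first use)
    meteor = evaluate.load("meteor")

    # Hypothetical prediction/reference pairs, standing in for dataset.y_pred(model) and dataset.y
    predictions = ["a cat sat on the mat", "hello to the whole world"]
    references = ["the cat sat on the mat", "hello world"]

    # One METEOR score in [0, 1] per pair, mirroring the loop in the new MeteorScore function
    scores = [
        meteor.compute(predictions=[p], references=[r])["meteor"]
        for p, r in zip(predictions, references)
    ]
    print(scores)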
validmind/tests/model_validation/RegardScore.py
@@ -2,142 +2,124 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import itertools
-from dataclasses import dataclass
-
 import evaluate
+import pandas as pd
 import plotly.graph_objects as go
-from plotly.subplots import make_subplots
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
 
 
-@dataclass
-class RegardScore(Metric):
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def RegardScore(dataset, model):
     """
+    Computes and visualizes the regard score for each text instance, assessing sentiment and potential biases.
+
     **Purpose:**
-    The `RegardScore` metric assesses the degree of regard—positive, negative, neutral, or other—present in the given text,
-    whether it's a classification or summarization result. Especially crucial for applications like sentiment analysis,
-    product reviews, or opinion mining, it provides a granular understanding of how the model perceives or generates content
-    in terms of favorability or sentiment.
+    The `RegardScore` metric is designed to evaluate the regard levels (positive, negative, neutral, or other) of texts generated by models. This helps in understanding the sentiment and biases in the generated content.
 
     **Test Mechanism:**
-    The metric ingests data primarily from the model's test dataset, extracting the input text, target text (true regard),
-    and the model's predicted regard. These elements undergo a series of consistency checks before being processed. Using
-    the `evaluate.load("regard")` tool, regard scores are computed for each segment of text. The results are then visualized
-    in a multi-subplot line graph, where each subplot corresponds to a particular category of regard (e.g., positive, negative,
-    neutral, other) against the input, target, and predicted texts.
+    The function starts by extracting the true and predicted values from the provided dataset and model. The regard scores are computed for each text using a preloaded `regard` evaluation tool. The scores are compiled into dataframes, and histograms and bar charts are generated to visualize the distribution of regard scores. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the regard scores, providing a comprehensive summary of the model's performance.
 
     **Signs of High Risk:**
-    Disparities between the target regard scores and the predicted regard scores may signify potential flaws or biases in
-    the model. For instance, if neutral inputs are consistently perceived as strongly positive or negative, this could
-    indicate the model's inability to correctly identify or generate balanced sentiments.
+    - Noticeable skewness in the histogram, especially when comparing the predicted regard scores with the target regard scores, could indicate biases or inconsistencies in the model.
+    - Lack of neutral scores in the model's predictions, despite a balanced distribution in the target data, might signal an issue.
 
     **Strengths:**
-    The metric's visual presentation, using line plots, provides an intuitive way to comprehend the model's regard assessment
-    across different text samples and regard categories. The color-coded lines associated with each regard category further
-    enhance the clarity of the comparison, making it simpler for stakeholders or researchers to infer the model's performance.
+    - Provides a clear evaluation of regard levels in generated texts, helping to ensure content appropriateness.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of regard scores.
+    - Descriptive statistics offer a concise summary of the model's performance in generating texts with balanced sentiments.
 
     **Limitations:**
-    The `RegardScoreHistogram` metric emphasizes regard scores but may not always grasp intricate nuances or the true context
-    of texts. Its reliance on underlying tools, which might be trained on potentially biased datasets, can result in skewed
-    interpretations. Additionally, while the metric segments regard into discrete categories such as "positive" and "negative,"
-    real-world sentiments often exist on a more complex spectrum. The metric's efficacy is intertwined with the accuracy of
-    the model's predictions; any inherent model inaccuracies can impact the metric's reflection of true sentiments.
+    - The accuracy of the regard scores is contingent upon the underlying `regard` tool.
+    - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for high regard.
+    - Supplementary, in-depth analysis might be needed for granular insights.
     """
 
-    name = "regard_score"
-    required_inputs = ["model", "dataset"]
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["regard_score"],
-    }
-
-    def _get_datasets(self):
-        if not hasattr(self, "model"):
-            raise AttributeError("The 'model' attribute is missing.")
-
-        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-        if not len(y_true) == len(y_pred):
-            raise ValueError(
-                "Inconsistent lengths among input text, true summaries, and predicted summaries."
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Load the regard evaluation metric
+    regard_tool = evaluate.load("regard")
+
+    # Function to calculate regard scores
+    def compute_regard_scores(texts):
+        scores = regard_tool.compute(data=texts)["regard"]
+        regard_dicts = [
+            dict((x["label"], x["score"]) for x in sublist) for sublist in scores
+        ]
+        return regard_dicts
+
+    # Calculate regard scores for true and predicted texts
+    true_regard = compute_regard_scores(y_true)
+    pred_regard = compute_regard_scores(y_pred)
+
+    # Convert scores to dataframes
+    true_df = pd.DataFrame(true_regard)
+    pred_df = pd.DataFrame(pred_regard)
+
+    figures = []
+
+    # Function to create histogram and bar chart for regard scores
+    def create_figures(df, title):
+        for category in df.columns:
+            # Histogram
+            hist_fig = go.Figure(data=[go.Histogram(x=df[category])])
+            hist_fig.update_layout(
+                title=f"{title} - {category.capitalize()} Histogram",
+                xaxis_title=category.capitalize(),
+                yaxis_title="Count",
             )
-
-        return y_true, y_pred
-
-    def regard_line_plot(self):
-        regard_tool = evaluate.load("regard")
-        y_true, y_pred = self._get_datasets()
-
-        dataframes = {
-            "Target Text": y_true,
-            "Predicted Summaries": y_pred,
-        }
-
-        total_text_columns = len(dataframes)
-        total_rows = total_text_columns * 2
-
-        categories_order = ["positive", "negative", "neutral", "other"]
-        category_colors = {
-            "negative": "#d9534f",
-            "neutral": "#5bc0de",
-            "other": "#f0ad4e",
-            "positive": "#5cb85c",
-        }
-
-        fig = make_subplots(
-            rows=total_rows,
-            cols=2,
-            subplot_titles=[
-                f"{col_name} {cat}"
-                for col_name in dataframes
-                for cat in categories_order
-            ],
-            shared_yaxes=True,
-            vertical_spacing=0.1,
-        )
-
-        row_offset = 0
-        for column_name, column_data in dataframes.items():
-            results = regard_tool.compute(data=column_data)["regard"]
-            regard_dicts = [
-                dict((x["label"], x["score"]) for x in sublist) for sublist in results
-            ]
-
-            for idx, category in enumerate(categories_order, start=1):
-                row, col = ((idx - 1) // 2 + 1 + row_offset, (idx - 1) % 2 + 1)
-                scores = [res_dict[category] for res_dict in regard_dicts]
-                fig.add_trace(
-                    go.Scatter(
-                        name=f"{category} ({column_name})",
-                        x=list(range(len(column_data))),
-                        y=scores,
-                        mode="lines+markers",
-                        marker=dict(size=5),
-                        hoverinfo="y+name",
-                        line=dict(color=category_colors[category], width=1.5),
-                        showlegend=False,
-                    ),
-                    row=row,
-                    col=col,
-                )
-            row_offset += 2
-
-        subplot_height = 350
-        total_height = total_rows * subplot_height + 200
-
-        fig.update_layout(title_text="Regard Scores", height=total_height)
-        fig.update_yaxes(range=[0, 1])
-        fig.update_xaxes(showticklabels=False, row=1, col=1)
-        fig.update_xaxes(title_text="Index", showticklabels=True, row=1, col=1)
-        fig.update_yaxes(title_text="Score", showticklabels=True, row=1, col=1)
-
-        return fig
-
-    def run(self):
-        fig = self.regard_line_plot()
-        return self.cache_results(
-            figures=[Figure(for_object=self, key=self.key, figure=fig)]
-        )
+            figures.append(hist_fig)
+
+            # Bar Chart
+            bar_fig = go.Figure(data=[go.Bar(x=df.index, y=df[category])])
+            bar_fig.update_layout(
+                title=f"{title} - {category.capitalize()} Bar Chart",
+                xaxis_title="Text Instance Index",
+                yaxis_title=category.capitalize(),
+            )
+            figures.append(bar_fig)
+
+    # Create figures for each regard score dataframe
+    create_figures(true_df, "True Text Regard")
+    create_figures(pred_df, "Predicted Text Regard")
+
+    # Calculate statistics for each regard score dataframe
+    def calculate_stats(df, metric_name):
+        stats = df.describe().loc[["mean", "50%", "max", "min", "std"]].T
+        stats.columns = [
+            "Mean Score",
+            "Median Score",
+            "Max Score",
+            "Min Score",
+            "Standard Deviation",
+        ]
+        stats["Metric"] = metric_name
+        stats["Count"] = len(df)
+        return stats
+
+    true_stats = calculate_stats(true_df, "True Text Regard")
+    pred_stats = calculate_stats(pred_df, "Predicted Text Regard")
+
+    # Combine statistics into a single dataframe
+    result_df = (
+        pd.concat([true_stats, pred_stats])
+        .reset_index()
+        .rename(columns={"index": "Category"})
+    )
+    result_df = result_df[
+        [
+            "Metric",
+            "Category",
+            "Mean Score",
+            "Median Score",
+            "Max Score",
+            "Min Score",
+            "Standard Deviation",
+            "Count",
+        ]
+    ]
+
+    return (result_df, *tuple(figures))
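Note: as with MeteorScore, the regard test is now a decorated function that returns a statistics table plus Plotly figures. The sketch below (not part of the diff) shows the shape of the output that `evaluate.load("regard")` produces and how the new `compute_regard_scores` helper flattens it into one label-to-score dict per text; the input sentence is a made-up example, and the exact labels and scores depend on the underlying regard model.

    import evaluate

    # Load the regard classifier via Hugging Face `evaluate` (downloads a transformer model on first use)
    regard_tool = evaluate.load("regard")

    texts = ["The analysts delivered the report ahead of schedule."]  # hypothetical input text
    raw = regard_tool.compute(data=texts)["regard"]  # one list of {"label", "score"} dicts per text

    # Flatten to one {label: score} dict per text, as the new RegardScore test does
    flattened = [{item["label"]: item["score"] for item in sublist} for sublist in raw]
    print(flattened)  # typically keys like "positive", "negative", "neutral", "other"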
validmind/tests/model_validation/RougeScore.py (new file)
@@ -0,0 +1,118 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+import plotly.graph_objects as go
+from rouge import Rouge
+
+from validmind import tags, tasks
+
+
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def RougeScore(dataset, model, metric="rouge-1"):
+    """
+    Evaluates the quality of machine-generated text using ROUGE metrics and visualizes the results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics for each ROUGE metric.
+
+    **Purpose:**
+    This function is designed to assess the quality of text generated by machine learning models using various ROUGE metrics.
+    ROUGE, which stands for Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics used to evaluate the
+    overlap of n-grams, word sequences, and word pairs between the machine-generated text and reference texts. This evaluation
+    is crucial for tasks such as text summarization, machine translation, and text generation, where the goal is to produce text
+    that accurately reflects the content and meaning of human-crafted references.
+
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes the ROUGE
+    evaluator with the specified metric (e.g., ROUGE-1). For each pair of true and predicted texts, the function calculates the ROUGE
+    scores and compiles them into a dataframe. Histograms and bar charts are generated for each ROUGE metric (Precision, Recall, and F1 Score)
+    to visualize their distribution. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum)
+    is compiled for each metric, providing a comprehensive summary of the model's performance.
+
+    **Signs of High Risk:**
+
+    - Consistently low scores across ROUGE metrics could indicate poor quality in the generated text, suggesting that the model fails
+    to capture the essential content of the reference texts.
+    - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
+    - Low recall scores may indicate that important information from the reference text is being omitted.
+    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the model's ability
+    to balance informativeness and conciseness.
+
+    **Strengths:**
+
+    - Provides a multifaceted evaluation of text quality through different ROUGE metrics, offering a detailed view of model performance.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
+
+    **Limitations:**
+
+    - ROUGE metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or grammatical quality of the text.
+    - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
+    - While useful for comparison, ROUGE scores alone do not provide a complete assessment of a model's performance and should be
+    supplemented with other metrics and qualitative analysis.
+    """
+
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Initialize Rouge with the specified metric
+    rouge = Rouge(metrics=[metric])
+
+    # Calculate ROUGE scores
+    score_list = []
+    for y_t, y_p in zip(y_true, y_pred):
+        scores = rouge.get_scores(y_p, y_t, avg=True)
+        score_list.append(scores)
+
+    # Convert scores to a dataframe
+    metrics_df = pd.DataFrame(score_list)
+    df_scores = pd.DataFrame(metrics_df[metric].tolist())
+
+    # Generate histograms and bar charts for each score type
+    score_types = ["p", "r", "f"]
+    score_names = ["Precision", "Recall", "F1 Score"]
+    figures = []
+
+    for score_type, score_name in zip(score_types, score_names):
+        # Histogram
+        hist_fig = go.Figure(data=[go.Histogram(x=df_scores[score_type])])
+        hist_fig.update_layout(
+            title=f"{score_name} Histogram for {metric.upper()}",
+            xaxis_title=score_name,
+            yaxis_title="Count",
+        )
+        figures.append(hist_fig)
+
+        # Bar Chart
+        bar_fig = go.Figure(data=[go.Bar(x=df_scores.index, y=df_scores[score_type])])
+        bar_fig.update_layout(
+            title=f"{score_name} Bar Chart for {metric.upper()}",
+            xaxis_title="Row Index",
+            yaxis_title=score_name,
+        )
+        figures.append(bar_fig)
+
+    # Calculate statistics for each score type
+    stats_df = df_scores.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Score",
+            "50%": "Median Score",
+            "max": "Max Score",
+            "min": "Min Score",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(df_scores)
+
+    # Rename metrics for clarity
+    stats_df.index = stats_df.index.map(
+        {"p": "Precision", "r": "Recall", "f": "F1 Score"}
+    )
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
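Note: RougeScore.py is a new test rather than a rewrite of an existing one; it wraps the `rouge` package rather than Hugging Face `evaluate`. The sketch below (not part of the diff) shows the `Rouge.get_scores` call the test builds on, using a made-up hypothesis/reference pair; the returned dict maps the requested metric to its recall ("r"), precision ("p"), and F1 ("f") values.

    from rouge import Rouge

    # Restrict scoring to ROUGE-1, matching the test's default metric="rouge-1"
    rouge = Rouge(metrics=["rouge-1"])

    hypothesis = "the model produces a short summary of the report"  # hypothetical generated text
    reference = "the model generates a concise summary of the quarterly report"  # hypothetical reference

    scores = rouge.get_scores(hypothesis, reference, avg=True)
    print(scores["rouge-1"])  # e.g. {"r": ..., "p": ..., "f": ...}, each in [0, 1]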