validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +3 -3
  3. validmind/api_client.py +2 -3
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/html_templates/__init__.py +0 -0
  13. validmind/html_templates/content_blocks.py +89 -14
  14. validmind/models/__init__.py +7 -4
  15. validmind/models/foundation.py +8 -34
  16. validmind/models/function.py +51 -0
  17. validmind/models/huggingface.py +16 -46
  18. validmind/models/metadata.py +42 -0
  19. validmind/models/pipeline.py +66 -0
  20. validmind/models/pytorch.py +8 -42
  21. validmind/models/r_model.py +33 -82
  22. validmind/models/sklearn.py +39 -38
  23. validmind/template.py +8 -26
  24. validmind/tests/__init__.py +43 -20
  25. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  26. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  27. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  28. validmind/tests/data_validation/Duplicates.py +1 -1
  29. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  30. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  31. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  32. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  34. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  35. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  36. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  37. validmind/tests/decorator.py +2 -2
  38. validmind/tests/model_validation/BertScore.py +100 -98
  39. validmind/tests/model_validation/BleuScore.py +93 -64
  40. validmind/tests/model_validation/ContextualRecall.py +74 -91
  41. validmind/tests/model_validation/MeteorScore.py +86 -74
  42. validmind/tests/model_validation/RegardScore.py +103 -121
  43. validmind/tests/model_validation/RougeScore.py +118 -0
  44. validmind/tests/model_validation/TokenDisparity.py +84 -121
  45. validmind/tests/model_validation/ToxicityScore.py +109 -123
  46. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  47. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  48. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  50. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  51. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  52. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  56. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  57. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  58. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  59. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  60. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  61. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  62. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  63. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  65. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  66. validmind/tests/model_validation/ragas/utils.py +66 -0
  67. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  68. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  69. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  70. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  71. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  72. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  73. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  74. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
  75. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  76. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  83. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  84. validmind/unit_metrics/__init__.py +26 -49
  85. validmind/unit_metrics/composite.py +5 -1
  86. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  87. validmind/utils.py +56 -6
  88. validmind/vm_models/__init__.py +1 -1
  89. validmind/vm_models/dataset/__init__.py +7 -0
  90. validmind/vm_models/dataset/dataset.py +558 -0
  91. validmind/vm_models/dataset/utils.py +146 -0
  92. validmind/vm_models/model.py +97 -72
  93. validmind/vm_models/test/result_wrapper.py +61 -24
  94. validmind/vm_models/test_context.py +1 -1
  95. validmind/vm_models/test_suite/summary.py +3 -4
  96. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
  97. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
  98. validmind/models/catboost.py +0 -33
  99. validmind/models/statsmodels.py +0 -50
  100. validmind/models/xgboost.py +0 -30
  101. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  102. validmind/tests/model_validation/RegardHistogram.py +0 -148
  103. validmind/tests/model_validation/RougeMetrics.py +0 -147
  104. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  105. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  106. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  107. validmind/vm_models/dataset.py +0 -1303
  108. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
  109. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
  110. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ContextualRecall.py
@@ -2,109 +2,92 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- import itertools
- from dataclasses import dataclass
-
  import nltk
  import pandas as pd
  import plotly.graph_objects as go

- from validmind.vm_models import Figure, Metric
+ from validmind import tags, tasks


- @dataclass
- class ContextualRecall(Metric):
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def ContextualRecall(dataset, model):
      """
-     Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct
-     text.
-
-     **Purpose**:
-     The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to
-     generate text that appropriately reflects the given context or prompt. It measures the model's capability to
-     remember and reproduce the main context in its resulting output. This metric is critical in natural language
-     processing tasks, as the coherency and contextuality of the generated text are essential.
-
-     **Test Mechanism**:
-
-     1. **Preparation of Reference and Candidate Texts**:
-     - **Reference Texts**: Gather the reference text(s) which exemplify the expected or ideal output for a specific
-     context or prompt.
-     - **Candidate Texts**: Generate candidate text(s) from the NLG model under evaluation using the same context.
-     2. **Tokenization and Preprocessing**:
-     - Tokenize the reference and candidate texts into discernible words or tokens using libraries such as NLTK.
-     3. **Computation of Contextual Recall**:
-     - Identify the token overlap between the reference and candidate texts.
-     - The Contextual Recall score is computed by dividing the number of overlapping tokens by the total number of
-     tokens in the reference text. Scores are calculated for each test dataset instance, resulting in an array of
-     scores. These scores are then visualized using a line plot to show score variations across different rows.
-
-     **Signs of High Risk**:
-
-     - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in
-     its output, leading to incoherent or contextually misaligned text.
-     - A consistent trend of low recall scores could suggest underperformance of the model.
+     Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct text, visualizing the results through histograms and bar charts, alongside compiling a comprehensive table of descriptive statistics for contextual recall scores.

-     **Strengths**:
+     **Purpose:**
+     The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to generate text that appropriately reflects the given context or prompt. It measures the model's capability to remember and reproduce the main context in its resulting output. This metric is critical in natural language processing tasks, as the coherency and contextuality of the generated text are essential.

-     - The Contextual Recall metric provides a quantifiable measure of a model's adherence to the context and factual
-     elements of the generated narrative.
-     - This metric finds particular value in applications requiring deep comprehension of context, such as text
-     continuation or interactive dialogue systems.
-     - The line plot visualization provides a clear and intuitive representation of score fluctuations.
+     **Test Mechanism:**
+     The function starts by extracting the true and predicted values from the provided dataset and model. It then tokenizes the reference and candidate texts into discernible words or tokens using NLTK. The token overlap between the reference and candidate texts is identified, and the Contextual Recall score is computed by dividing the number of overlapping tokens by the total number of tokens in the reference text. Scores are calculated for each test dataset instance, resulting in an array of scores. These scores are visualized using a histogram and a bar chart to show score variations across different rows. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the contextual recall scores, providing a comprehensive summary of the model's performance.
+
+     **Signs of High Risk:**
+     - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in its output, leading to incoherent or contextually misaligned text.
+     - A consistent trend of low recall scores could suggest underperformance of the model.

-     **Limitations**:
+     **Strengths:**
+     - Provides a quantifiable measure of a model's adherence to the context and factual elements of the generated narrative.
+     - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of contextual recall scores.
+     - Descriptive statistics offer a concise summary of the model's performance in generating contextually relevant texts.

-     - Despite its effectiveness, the Contextual Recall could fail to comprehensively assess the performance of NLG
-     models. Its focus on word overlap could result in high scores for texts that use many common words, even when these
-     texts lack coherence or meaningful context.
+     **Limitations:**
+     - The focus on word overlap could result in high scores for texts that use many common words, even when these texts lack coherence or meaningful context.
      - This metric does not consider the order of words, which could lead to overestimated scores for scrambled outputs.
      - Models that effectively use infrequent words might be undervalued, as these words might not overlap as often.
      """

-     name = "contextual_recall"
-     required_inputs = ["model", "dataset"]
-
-     def run(self):
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-         score_list = []
-         for y_t, y_p in zip(y_true, y_pred):
-             # Tokenize the reference and candidate texts
-             reference_tokens = nltk.word_tokenize(y_t.lower())
-             candidate_tokens = nltk.word_tokenize(y_p.lower())
-
-             # Calculate overlapping tokens
-             overlapping_tokens = set(reference_tokens) & set(candidate_tokens)
-
-             # Compute contextual recall
-             score_list.append(len(overlapping_tokens) / len(reference_tokens))
-
-         metrics_df = pd.DataFrame(score_list, columns=["Contextual Recall"])
-         figures = []
-         # Visualization part
-         fig = go.Figure()
-
-         # Adding the line plots
-         fig.add_trace(
-             go.Scatter(
-                 x=metrics_df.index,
-                 y=metrics_df["Contextual Recall"],
-                 mode="lines+markers",
-                 name="Contextual Recall",
-             )
-         )
-         fig.update_layout(
-             title="Contextual Recall scores for each row",
-             xaxis_title="Row Index",
-             yaxis_title="Score",
-         )
-         figures.append(
-             Figure(
-                 for_object=self,
-                 key=self.key,
-                 figure=fig,
-             )
-         )
-
-         return self.cache_results(figures=figures)
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)
+
+     score_list = []
+     for y_t, y_p in zip(y_true, y_pred):
+         # Tokenize the reference and candidate texts
+         reference_tokens = nltk.word_tokenize(y_t.lower())
+         candidate_tokens = nltk.word_tokenize(y_p.lower())
+
+         # Calculate overlapping tokens
+         overlapping_tokens = set(reference_tokens) & set(candidate_tokens)
+
+         # Compute contextual recall
+         score_list.append(len(overlapping_tokens) / len(reference_tokens))
+
+     metrics_df = pd.DataFrame(score_list, columns=["Contextual Recall"])
+     figures = []
+
+     # Histogram for Contextual Recall
+     hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["Contextual Recall"])])
+     hist_fig.update_layout(
+         title="Contextual Recall Histogram",
+         xaxis_title="Contextual Recall",
+         yaxis_title="Count",
+     )
+     figures.append(hist_fig)
+
+     # Bar Chart for Contextual Recall
+     bar_fig = go.Figure(
+         data=[go.Bar(x=metrics_df.index, y=metrics_df["Contextual Recall"])]
+     )
+     bar_fig.update_layout(
+         title="Contextual Recall Bar Chart",
+         xaxis_title="Row Index",
+         yaxis_title="Contextual Recall",
+     )
+     figures.append(bar_fig)
+
+     # Calculate statistics for Contextual Recall
+     stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+     stats_df = stats_df.rename(
+         index={
+             "mean": "Mean Score",
+             "50%": "Median Score",
+             "max": "Max Score",
+             "min": "Min Score",
+             "std": "Standard Deviation",
+         }
+     ).T
+     stats_df["Count"] = len(metrics_df)
+
+     # Create a DataFrame from all collected statistics
+     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+     return (result_df, *tuple(figures))
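The refactor above replaces the class-based `Metric` with a plain function decorated by `@tags` and `@tasks`, but the scoring step itself is unchanged. As a quick illustration of that step in isolation, here is a minimal standalone sketch of the overlap-based recall computation; the sample strings are hypothetical and NLTK tokenizer data is assumed to be available for download.

```python
# Standalone sketch of the contextual-recall computation used in the new
# ContextualRecall function above (hypothetical sample strings, not from the package).
import nltk

# Tokenizer models required by nltk.word_tokenize
# (newer NLTK releases may additionally prompt for "punkt_tab").
nltk.download("punkt", quiet=True)

reference = "The quarterly report shows revenue grew by ten percent."
candidate = "Revenue grew by ten percent in the quarterly report."

# Tokenize both texts, lowercased, as the test does
reference_tokens = nltk.word_tokenize(reference.lower())
candidate_tokens = nltk.word_tokenize(candidate.lower())

# Recall = unique overlapping tokens / total tokens in the reference
overlap = set(reference_tokens) & set(candidate_tokens)
score = len(overlap) / len(reference_tokens)
print(f"Contextual Recall: {score:.2f}")
```

Note that the numerator counts unique overlapping tokens while the denominator counts all reference tokens (duplicates included), so repeated reference words pull the score down in the same way as the test's `score_list` computation.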
validmind/tests/model_validation/MeteorScore.py
@@ -2,91 +2,103 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
-
  import evaluate
  import pandas as pd
  import plotly.graph_objects as go

- from validmind.vm_models import Figure, Metric
+ from validmind import tags, tasks


- @dataclass
- class MeteorScore(Metric):
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def MeteorScore(dataset, model):
      """
      Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.

-     **Purpose**: METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality
-     of machine translations by comparing them against reference translations. It emphasizes both the accuracy and fluency
-     of translations, incorporating precision, recall, and word order into its assessment.
-
-     **Test Mechanism**: The METEOR score is computed for each pair of machine-generated translation (prediction) and its
-     corresponding human-produced reference. This is done by considering unigram matches between the translations, including
-     matches based on surface forms, stemmed forms, and synonyms. The score is a combination of unigram precision and recall,
-     adjusted for word order through a fragmentation penalty.
-
-     **Signs of High Risk**:
-     - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references, highlighting potential deficiencies in both the accuracy and fluency of translations.
-     - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes and reconstructs sentence structures, potentially compromising the natural flow of translated text.
-     - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
-
-     **Strengths**:
-     - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of
-     content coverage in translations.
+     **Purpose:**
+     METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality of machine translations
+     by comparing them against reference translations. It emphasizes both the accuracy and fluency of translations, incorporating
+     precision, recall, and word order into its assessment.
+
+     **Test Mechanism:**
+     The function starts by extracting the true and predicted values from the provided dataset and model. The METEOR score is computed
+     for each pair of machine-generated translation (prediction) and its corresponding human-produced reference. This is done by
+     considering unigram matches between the translations, including matches based on surface forms, stemmed forms, and synonyms.
+     The score is a combination of unigram precision and recall, adjusted for word order through a fragmentation penalty. Scores are
+     compiled into a dataframe, and histograms and bar charts are generated to visualize the distribution of METEOR scores. Additionally,
+     a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the METEOR scores,
+     providing a comprehensive summary of the model's performance.
+
+     **Signs of High Risk:**
+     - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references,
+     highlighting potential deficiencies in both the accuracy and fluency of translations.
+     - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes
+     and reconstructs sentence structures, potentially compromising the natural flow of translated text.
+     - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the
+     nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
+
+     **Strengths:**
+     - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of content coverage in translations.
      - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
      - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.

-     **Limitations**:
-     - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
-     large datasets.
-     - The use of external resources for synonym and stemming matching may introduce variability based on the resources'
-     quality and relevance to the specific translation task.
+     **Limitations:**
+     - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for large datasets.
+     - The use of external resources for synonym and stemming matching may introduce variability based on the resources' quality and relevance to the specific
+     translation task.
      """

-     name = "meteor_score"
-     required_inputs = ["model", "dataset"]
-
-     def run(self):
-         # Load the METEOR metric
-         meteor = evaluate.load("meteor")
-
-         # Initialize a list to hold METEOR scores
-         meteor_scores = []
-
-         for prediction, reference in zip(
-             self.inputs.dataset.y_pred(self.inputs.model),
-             self.inputs.dataset.y,
-         ):
-             # Compute the METEOR score for the current prediction-reference pair
-             result = meteor.compute(predictions=[prediction], references=[reference])
-             meteor_scores.append(result["meteor"])
-
-         # Visualization of METEOR scores
-         figures = self.visualize_scores(meteor_scores)
-
-         return self.cache_results(figures=figures)
-
-     def visualize_scores(self, scores):
-         # Convert the scores list to a DataFrame for plotting
-         scores_df = pd.DataFrame(scores, columns=["METEOR Score"])
-
-         # Create a line plot of the METEOR scores
-         fig = go.Figure()
-         fig.add_trace(
-             go.Scatter(
-                 x=scores_df.index,
-                 y=scores_df["METEOR Score"],
-                 mode="lines+markers",
-                 name="METEOR Score",
-             )
-         )
-         fig.update_layout(
-             title="METEOR Scores Across Text Instances",
-             xaxis_title="Text Instance Index",
-             yaxis_title="METEOR Score",
-         )
-
-         # Wrap the Plotly figure for compatibility with your framework
-         figures = [Figure(for_object=self, key=self.key, figure=fig)]
-
-         return figures
+     # Extract true and predicted values
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)
+
+     # Load the METEOR evaluation metric
+     meteor = evaluate.load("meteor")
+
+     # Calculate METEOR scores
+     score_list = []
+     for y_t, y_p in zip(y_true, y_pred):
+         # Compute the METEOR score
+         score = meteor.compute(predictions=[y_p], references=[y_t])
+         score_list.append(score["meteor"])
+
+     # Convert scores to a dataframe
+     metrics_df = pd.DataFrame(score_list, columns=["METEOR Score"])
+
+     figures = []
+
+     # Histogram for METEOR Score
+     hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["METEOR Score"])])
+     hist_fig.update_layout(
+         title="METEOR Score Histogram",
+         xaxis_title="METEOR Score",
+         yaxis_title="Count",
+     )
+     figures.append(hist_fig)
+
+     # Bar Chart for METEOR Score
+     bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df["METEOR Score"])])
+     bar_fig.update_layout(
+         title="METEOR Score Bar Chart",
+         xaxis_title="Row Index",
+         yaxis_title="METEOR Score",
+     )
+     figures.append(bar_fig)
+
+     # Calculate statistics for METEOR Score
+     stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+     stats_df = stats_df.rename(
+         index={
+             "mean": "Mean Score",
+             "50%": "Median Score",
+             "max": "Max Score",
+             "min": "Min Score",
+             "std": "Standard Deviation",
+         }
+     ).T
+     stats_df["Count"] = len(metrics_df)
+
+     # Create a DataFrame from all collected statistics
+     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+     return (result_df, *tuple(figures))
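The rewritten test now computes METEOR row by row with the `evaluate` package rather than through a `Metric.run` method, so a small standalone sketch may help clarify what each `meteor.compute` call returns. The example texts are hypothetical, and the sketch assumes `evaluate` and the metric's NLTK data dependencies are installed.

```python
# Minimal standalone sketch of the per-row METEOR computation used above
# (hypothetical sample texts, not from the package).
import evaluate
import pandas as pd

# Loads the METEOR metric; its first use may download NLTK wordnet/punkt data.
meteor = evaluate.load("meteor")

references = ["the cat sat on the mat", "it is raining heavily today"]
predictions = ["a cat was sitting on the mat", "today it rains very heavily"]

# One compute() call per prediction/reference pair, mirroring the test's loop;
# each call returns a dict with a single "meteor" key.
scores = [
    meteor.compute(predictions=[p], references=[r])["meteor"]
    for p, r in zip(predictions, references)
]

# Same descriptive-statistics rows the test extracts from describe()
metrics_df = pd.DataFrame(scores, columns=["METEOR Score"])
print(metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]])
```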
validmind/tests/model_validation/RegardScore.py
@@ -2,142 +2,124 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- import itertools
- from dataclasses import dataclass
-
  import evaluate
+ import pandas as pd
  import plotly.graph_objects as go
- from plotly.subplots import make_subplots

- from validmind.vm_models import Figure, Metric
+ from validmind import tags, tasks


- @dataclass
- class RegardScore(Metric):
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def RegardScore(dataset, model):
      """
+     Computes and visualizes the regard score for each text instance, assessing sentiment and potential biases.
+
      **Purpose:**
-     The `RegardScore` metric assesses the degree of regard—positive, negative, neutral, or other—present in the given text,
-     whether it's a classification or summarization result. Especially crucial for applications like sentiment analysis,
-     product reviews, or opinion mining, it provides a granular understanding of how the model perceives or generates content
-     in terms of favorability or sentiment.
+     The `RegardScore` metric is designed to evaluate the regard levels (positive, negative, neutral, or other) of texts generated by models. This helps in understanding the sentiment and biases in the generated content.

      **Test Mechanism:**
-     The metric ingests data primarily from the model's test dataset, extracting the input text, target text (true regard),
-     and the model's predicted regard. These elements undergo a series of consistency checks before being processed. Using
-     the `evaluate.load("regard")` tool, regard scores are computed for each segment of text. The results are then visualized
-     in a multi-subplot line graph, where each subplot corresponds to a particular category of regard (e.g., positive, negative,
-     neutral, other) against the input, target, and predicted texts.
+     The function starts by extracting the true and predicted values from the provided dataset and model. The regard scores are computed for each text using a preloaded `regard` evaluation tool. The scores are compiled into dataframes, and histograms and bar charts are generated to visualize the distribution of regard scores. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the regard scores, providing a comprehensive summary of the model's performance.

      **Signs of High Risk:**
-     Disparities between the target regard scores and the predicted regard scores may signify potential flaws or biases in
-     the model. For instance, if neutral inputs are consistently perceived as strongly positive or negative, this could
-     indicate the model's inability to correctly identify or generate balanced sentiments.
+     - Noticeable skewness in the histogram, especially when comparing the predicted regard scores with the target regard scores, could indicate biases or inconsistencies in the model.
+     - Lack of neutral scores in the model's predictions, despite a balanced distribution in the target data, might signal an issue.

      **Strengths:**
-     The metric's visual presentation, using line plots, provides an intuitive way to comprehend the model's regard assessment
-     across different text samples and regard categories. The color-coded lines associated with each regard category further
-     enhance the clarity of the comparison, making it simpler for stakeholders or researchers to infer the model's performance.
+     - Provides a clear evaluation of regard levels in generated texts, helping to ensure content appropriateness.
+     - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of regard scores.
+     - Descriptive statistics offer a concise summary of the model's performance in generating texts with balanced sentiments.

      **Limitations:**
-     The `RegardScoreHistogram` metric emphasizes regard scores but may not always grasp intricate nuances or the true context
-     of texts. Its reliance on underlying tools, which might be trained on potentially biased datasets, can result in skewed
-     interpretations. Additionally, while the metric segments regard into discrete categories such as "positive" and "negative,"
-     real-world sentiments often exist on a more complex spectrum. The metric's efficacy is intertwined with the accuracy of
-     the model's predictions; any inherent model inaccuracies can impact the metric's reflection of true sentiments.
+     - The accuracy of the regard scores is contingent upon the underlying `regard` tool.
+     - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for high regard.
+     - Supplementary, in-depth analysis might be needed for granular insights.
      """

-     name = "regard_score"
-     required_inputs = ["model", "dataset"]
-     metadata = {
-         "task_types": ["text_classification", "text_summarization"],
-         "tags": ["regard_score"],
-     }
-
-     def _get_datasets(self):
-         if not hasattr(self, "model"):
-             raise AttributeError("The 'model' attribute is missing.")
-
-         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-         y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-         if not len(y_true) == len(y_pred):
-             raise ValueError(
-                 "Inconsistent lengths among input text, true summaries, and predicted summaries."
+     # Extract true and predicted values
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)
+
+     # Load the regard evaluation metric
+     regard_tool = evaluate.load("regard")
+
+     # Function to calculate regard scores
+     def compute_regard_scores(texts):
+         scores = regard_tool.compute(data=texts)["regard"]
+         regard_dicts = [
+             dict((x["label"], x["score"]) for x in sublist) for sublist in scores
+         ]
+         return regard_dicts
+
+     # Calculate regard scores for true and predicted texts
+     true_regard = compute_regard_scores(y_true)
+     pred_regard = compute_regard_scores(y_pred)
+
+     # Convert scores to dataframes
+     true_df = pd.DataFrame(true_regard)
+     pred_df = pd.DataFrame(pred_regard)
+
+     figures = []
+
+     # Function to create histogram and bar chart for regard scores
+     def create_figures(df, title):
+         for category in df.columns:
+             # Histogram
+             hist_fig = go.Figure(data=[go.Histogram(x=df[category])])
+             hist_fig.update_layout(
+                 title=f"{title} - {category.capitalize()} Histogram",
+                 xaxis_title=category.capitalize(),
+                 yaxis_title="Count",
              )
-
-         return y_true, y_pred
-
-     def regard_line_plot(self):
-         regard_tool = evaluate.load("regard")
-         y_true, y_pred = self._get_datasets()
-
-         dataframes = {
-             "Target Text": y_true,
-             "Predicted Summaries": y_pred,
-         }
-
-         total_text_columns = len(dataframes)
-         total_rows = total_text_columns * 2
-
-         categories_order = ["positive", "negative", "neutral", "other"]
-         category_colors = {
-             "negative": "#d9534f",
-             "neutral": "#5bc0de",
-             "other": "#f0ad4e",
-             "positive": "#5cb85c",
-         }
-
-         fig = make_subplots(
-             rows=total_rows,
-             cols=2,
-             subplot_titles=[
-                 f"{col_name} {cat}"
-                 for col_name in dataframes
-                 for cat in categories_order
-             ],
-             shared_yaxes=True,
-             vertical_spacing=0.1,
-         )
-
-         row_offset = 0
-         for column_name, column_data in dataframes.items():
-             results = regard_tool.compute(data=column_data)["regard"]
-             regard_dicts = [
-                 dict((x["label"], x["score"]) for x in sublist) for sublist in results
-             ]
-
-             for idx, category in enumerate(categories_order, start=1):
-                 row, col = ((idx - 1) // 2 + 1 + row_offset, (idx - 1) % 2 + 1)
-                 scores = [res_dict[category] for res_dict in regard_dicts]
-                 fig.add_trace(
-                     go.Scatter(
-                         name=f"{category} ({column_name})",
-                         x=list(range(len(column_data))),
-                         y=scores,
-                         mode="lines+markers",
-                         marker=dict(size=5),
-                         hoverinfo="y+name",
-                         line=dict(color=category_colors[category], width=1.5),
-                         showlegend=False,
-                     ),
-                     row=row,
-                     col=col,
-                 )
-             row_offset += 2
-
-         subplot_height = 350
-         total_height = total_rows * subplot_height + 200
-
-         fig.update_layout(title_text="Regard Scores", height=total_height)
-         fig.update_yaxes(range=[0, 1])
-         fig.update_xaxes(showticklabels=False, row=1, col=1)
-         fig.update_xaxes(title_text="Index", showticklabels=True, row=1, col=1)
-         fig.update_yaxes(title_text="Score", showticklabels=True, row=1, col=1)
-
-         return fig
-
-     def run(self):
-         fig = self.regard_line_plot()
-         return self.cache_results(
-             figures=[Figure(for_object=self, key=self.key, figure=fig)]
-         )
+             figures.append(hist_fig)
+
+             # Bar Chart
+             bar_fig = go.Figure(data=[go.Bar(x=df.index, y=df[category])])
+             bar_fig.update_layout(
+                 title=f"{title} - {category.capitalize()} Bar Chart",
+                 xaxis_title="Text Instance Index",
+                 yaxis_title=category.capitalize(),
+             )
+             figures.append(bar_fig)
+
+     # Create figures for each regard score dataframe
+     create_figures(true_df, "True Text Regard")
+     create_figures(pred_df, "Predicted Text Regard")
+
+     # Calculate statistics for each regard score dataframe
+     def calculate_stats(df, metric_name):
+         stats = df.describe().loc[["mean", "50%", "max", "min", "std"]].T
+         stats.columns = [
+             "Mean Score",
+             "Median Score",
+             "Max Score",
+             "Min Score",
+             "Standard Deviation",
+         ]
+         stats["Metric"] = metric_name
+         stats["Count"] = len(df)
+         return stats
+
+     true_stats = calculate_stats(true_df, "True Text Regard")
+     pred_stats = calculate_stats(pred_df, "Predicted Text Regard")
+
+     # Combine statistics into a single dataframe
+     result_df = (
+         pd.concat([true_stats, pred_stats])
+         .reset_index()
+         .rename(columns={"index": "Category"})
+     )
+     result_df = result_df[
+         [
+             "Metric",
+             "Category",
+             "Mean Score",
+             "Median Score",
+             "Max Score",
+             "Min Score",
+             "Standard Deviation",
+             "Count",
+         ]
+     ]
+
+     return (result_df, *tuple(figures))
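The `regard` metric returns, for each input text, a list of label/score pairs that the rewritten test pivots into one column per category before plotting. Here is a minimal standalone sketch of that pivot, with hypothetical texts; it assumes the `evaluate` package can download the underlying Hugging Face regard classifier on first use.

```python
# Standalone sketch of the regard computation and per-category pivot used in the
# rewritten RegardScore test above (hypothetical sample texts).
import evaluate
import pandas as pd

# Loads the regard metric; downloads a transformer classifier the first time.
regard_tool = evaluate.load("regard")

texts = [
    "The candidate was praised for her strong leadership.",
    "He was dismissed as careless and incompetent.",
]

# compute(data=...) returns, per text, a list of {"label", "score"} dicts
scores = regard_tool.compute(data=texts)["regard"]

# Pivot each text's label/score list into one column per regard category,
# matching the dataframes the test builds before plotting histograms/bar charts.
regard_df = pd.DataFrame(
    [{item["label"]: item["score"] for item in per_text} for per_text in scores]
)
print(regard_df)  # columns: positive, negative, neutral, other
```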