validmind 2.1.1__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +3 -3
  3. validmind/api_client.py +2 -3
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/html_templates/__init__.py +0 -0
  13. validmind/html_templates/content_blocks.py +89 -14
  14. validmind/models/__init__.py +7 -4
  15. validmind/models/foundation.py +8 -34
  16. validmind/models/function.py +51 -0
  17. validmind/models/huggingface.py +16 -46
  18. validmind/models/metadata.py +42 -0
  19. validmind/models/pipeline.py +66 -0
  20. validmind/models/pytorch.py +8 -42
  21. validmind/models/r_model.py +33 -82
  22. validmind/models/sklearn.py +39 -38
  23. validmind/template.py +8 -26
  24. validmind/tests/__init__.py +43 -20
  25. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  26. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  27. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  28. validmind/tests/data_validation/Duplicates.py +1 -1
  29. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  30. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  31. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  32. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  34. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  35. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  36. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  37. validmind/tests/decorator.py +2 -2
  38. validmind/tests/model_validation/BertScore.py +100 -98
  39. validmind/tests/model_validation/BleuScore.py +93 -64
  40. validmind/tests/model_validation/ContextualRecall.py +74 -91
  41. validmind/tests/model_validation/MeteorScore.py +86 -74
  42. validmind/tests/model_validation/RegardScore.py +103 -121
  43. validmind/tests/model_validation/RougeScore.py +118 -0
  44. validmind/tests/model_validation/TokenDisparity.py +84 -121
  45. validmind/tests/model_validation/ToxicityScore.py +109 -123
  46. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  47. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  48. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  50. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  51. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  52. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  56. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  57. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  58. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  59. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  60. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  61. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  62. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  63. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  65. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  66. validmind/tests/model_validation/ragas/utils.py +66 -0
  67. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  68. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  69. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  70. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  71. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  72. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  73. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  74. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
  75. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  76. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  83. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  84. validmind/unit_metrics/__init__.py +26 -49
  85. validmind/unit_metrics/composite.py +5 -1
  86. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  87. validmind/utils.py +56 -6
  88. validmind/vm_models/__init__.py +1 -1
  89. validmind/vm_models/dataset/__init__.py +7 -0
  90. validmind/vm_models/dataset/dataset.py +558 -0
  91. validmind/vm_models/dataset/utils.py +146 -0
  92. validmind/vm_models/model.py +97 -72
  93. validmind/vm_models/test/result_wrapper.py +61 -24
  94. validmind/vm_models/test_context.py +1 -1
  95. validmind/vm_models/test_suite/summary.py +3 -4
  96. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
  97. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
  98. validmind/models/catboost.py +0 -33
  99. validmind/models/statsmodels.py +0 -50
  100. validmind/models/xgboost.py +0 -30
  101. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  102. validmind/tests/model_validation/RegardHistogram.py +0 -148
  103. validmind/tests/model_validation/RougeMetrics.py +0 -147
  104. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  105. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  106. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  107. validmind/vm_models/dataset.py +0 -1303
  108. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
  109. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
  110. {validmind-2.1.1.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py
@@ -0,0 +1,48 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+
+import pandas as pd
+import plotly.express as px
+from textblob import TextBlob
+
+from validmind import tags, tasks
+
+
+@tags("data_validation")
+@tasks("nlp")
+def PolarityAndSubjectivity(dataset):
+    """
+    Analyzes the polarity and subjectivity of text data within a dataset.
+
+    This method processes a dataset containing textual data to compute the polarity and
+    subjectivity scores using TextBlob, and returns a Plotly scatter plot visualizing
+    these scores.
+
+    Args:
+        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
+            and a `text_column` attribute indicating the name of the column containing text.
+
+    Returns:
+        plotly.graph_objs._figure.Figure: A Plotly scatter plot of polarity vs subjectivity.
+    """
+    # Function to calculate sentiment and subjectivity
+    def analyze_sentiment(text):
+        analysis = TextBlob(text)
+        return analysis.sentiment.polarity, analysis.sentiment.subjectivity
+
+    data = pd.DataFrame()
+    # Apply the function to each row
+    data[["polarity", "subjectivity"]] = dataset.df[dataset.text_column].apply(
+        lambda x: pd.Series(analyze_sentiment(x))
+    )
+
+    # Create a Plotly scatter plot
+    fig = px.scatter(
+        data, x="polarity", y="subjectivity", title="Polarity vs Subjectivity"
+    )
+    fig.update_traces(textposition="top center")
+    fig.update_layout(xaxis_title="Polarity", yaxis_title="Subjectivity")
+
+    return fig
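
For quick experimentation, a function-style test like the one above can be run through the ValidMind test harness. The sketch below is a minimal, unverified example: it assumes the `vm.init_dataset(..., text_column=...)` and `vm.tests.run_test(...)` entry points and the test ID implied by the package layout; treat the exact argument names as assumptions rather than a confirmed invocation.

import pandas as pd
import validmind as vm

# Hypothetical toy data; any DataFrame with a free-text column works the same way.
df = pd.DataFrame({"text": ["I love this product.", "This is the worst service ever."]})

# Wrap the DataFrame so the test receives an object exposing `df` and `text_column`
# (argument name assumed from the docstring above).
vm_dataset = vm.init_dataset(dataset=df, text_column="text")

# Run the decorated test by its test ID (ID assumed from the module path).
result = vm.tests.run_test(
    "validmind.data_validation.nlp.PolarityAndSubjectivity",
    inputs={"dataset": vm_dataset},
)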
validmind/tests/data_validation/nlp/Punctuations.py
@@ -72,25 +72,24 @@ class Punctuations(Metric):
         text_column = self.inputs.dataset.text_column
         corpus = create_corpus(self.inputs.dataset.df, text_column=text_column)
 
-        dic = defaultdict(int)
         special = string.punctuation
+        dic = defaultdict(int, {key: 0 for key in special})
         for i in corpus:
             if i in special:
                 dic[i] += 1
-
+        figures = []
+        # if dic:
         fig = plt.figure()
        x, y = zip(*dic.items())
         plt.bar(x, y, color="#17C37B")
-
+        figures.append(
+            Figure(
+                for_object=self,
+                key=self.key,
+                figure=fig,
+            )
+        )
         # Do this if you want to prevent the figure from being displayed
         plt.close("all")
 
-        return self.cache_results(
-            figures=[
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=fig,
-                )
-            ]
-        )
+        return self.cache_results(figures=figures)
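
The behavioural change in this hunk is the pre-seeded counter: seeding `defaultdict(int)` with every character from `string.punctuation` means marks that never occur still appear as zero-height bars instead of being missing from `dic.items()`. A minimal, standalone illustration using only the standard library (outside the test class):

import string
from collections import defaultdict

corpus = list("Hello, world!")
special = string.punctuation

# Old behaviour: only punctuation actually present in the corpus gets a key.
old_counts = defaultdict(int)
for ch in corpus:
    if ch in special:
        old_counts[ch] += 1

# New behaviour: every punctuation character starts at zero, so absent marks
# still show up in the bar chart built from dic.items().
new_counts = defaultdict(int, {key: 0 for key in special})
for ch in corpus:
    if ch in special:
        new_counts[ch] += 1

print(len(old_counts))  # 2  -> only "," and "!"
print(len(new_counts))  # 32 -> all of string.punctuation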
validmind/tests/data_validation/nlp/Sentiment.py
@@ -0,0 +1,57 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+
+import matplotlib.pyplot as plt
+import nltk
+import seaborn as sns
+from nltk.sentiment import SentimentIntensityAnalyzer
+
+from validmind import tags, tasks
+
+
+@tags("data_validation")
+@tasks("nlp")
+def Sentiment(dataset):
+    """
+    Analyzes the sentiment of text data within a dataset using the VADER sentiment analysis tool.
+
+    This method initializes the VADER SentimentIntensityAnalyzer and applies it to each text entry
+    in the specified column of the dataset's dataframe. It returns a KDE plot visualizing the distribution
+    of sentiment scores across the dataset.
+
+    Args:
+        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
+            and a `text_column` attribute indicating the name of the column containing text.
+
+    Returns:
+        matplotlib.figure.Figure: A KDE plot visualizing the distribution of sentiment scores.
+    """
+    nltk.download("vader_lexicon", quiet=True)
+    # Initialize VADER
+    sia = SentimentIntensityAnalyzer()
+
+    # Function to get VADER sentiment scores
+    def get_vader_sentiment(text):
+        sentiment_score = sia.polarity_scores(text)
+        return sentiment_score["compound"]
+
+    # Apply the function to each row
+    vader_sentiment = dataset.df[dataset.text_column].apply(get_vader_sentiment)
+
+    fig = plt.figure()
+    ax = sns.kdeplot(
+        x=vader_sentiment,
+        fill=True,
+        common_norm=False,
+        palette="crest",
+        alpha=0.5,
+        linewidth=0,
+    )
+    ax.set_title(f"Sentiment score of {dataset.text_column} ")
+    ax.set_xlabel("Sentiment score")
+
+    plt.close("all")
+
+    return fig
validmind/tests/data_validation/nlp/Toxicity.py
@@ -0,0 +1,45 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import evaluate
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+from validmind import tags, tasks
+
+
+@tags("data_validation")
+@tasks("nlp")
+def Toxicity(dataset):
+    """
+    Analyzes the toxicity of text data within a dataset using a pre-trained toxicity model.
+
+    This method loads a toxicity evaluation model and applies it to each text entry
+    in the specified column of the dataset's dataframe. It returns a KDE plot visualizing the distribution
+    of toxicity scores across the dataset.
+
+    Args:
+        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
+            and a `text_column` attribute indicating the name of the column containing text.
+
+    Returns:
+        matplotlib.figure.Figure: A KDE plot visualizing the distribution of toxicity scores.
+    """
+    toxicity = evaluate.load("toxicity")
+    input_text = dataset.df[dataset.text_column]
+    toxicity_scores = toxicity.compute(predictions=list(input_text.values))["toxicity"]
+
+    fig = plt.figure()
+    ax = sns.kdeplot(
+        x=toxicity_scores,
+        fill=True,
+        common_norm=False,
+        palette="crest",
+        alpha=0.5,
+        linewidth=0,
+    )
+    ax.set_title(f"Toxicity score of {dataset.text_column} ")
+    ax.set_xlabel("Toxicity score")
+    plt.close("all")
+    return fig
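
The three new nlp tests above only touch `dataset.df` and `dataset.text_column`, so for ad-hoc use they can be fed any object exposing those two attributes. A minimal stand-in is sketched below; in normal use the object would come from `vm.init_dataset`, and the class name here is purely illustrative.

from dataclasses import dataclass

import pandas as pd


@dataclass
class TextDataset:
    """Illustrative stand-in for the interface these tests expect: a pandas
    DataFrame exposed as `df` plus the name of its text column."""
    df: pd.DataFrame
    text_column: str


dataset = TextDataset(
    df=pd.DataFrame({"comment": ["You are wonderful.", "This is terrible."]}),
    text_column="comment",
)

# With the functions above imported, either test can then be called directly,
# e.g. fig = Sentiment(dataset) or fig = Toxicity(dataset); both return a
# matplotlib Figure (Toxicity downloads its scoring model on first use).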
validmind/tests/decorator.py
@@ -153,7 +153,7 @@ def _get_run_method(func, inputs, params):
             test_id=self.test_id,
             description=inspect.getdoc(self),
             output_template=self.output_template,
-            inputs=list(inputs.keys()),
+            inputs=self.get_accessed_inputs(),
         )
 
         return self.result
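
This hunk and the next decorator.py hunk below both touch how the `@metric` decorator introspects the wrapped function. The sketch that follows illustrates, under assumptions, how required inputs and parameter defaults can be split from a function signature so that expressions like `list(inputs.keys())` and `{k: v["default"] for k, v in params.items()}` make sense; the helper name and dict layout are illustrative, not the actual code in validmind/tests/decorator.py.

import inspect


def _inspect_params(func):
    """Split a test function's arguments into required inputs (no default)
    and parameters (with defaults), storing each default in a small dict."""
    inputs, params = {}, {}
    for name, arg in inspect.signature(func).parameters.items():
        if arg.default is inspect.Parameter.empty:
            inputs[name] = arg  # e.g. dataset, model
        else:
            params[name] = {"default": arg.default}
    return inputs, params


def example_test(dataset, model, threshold=0.5, bins=10):
    ...


inputs, params = _inspect_params(example_test)
print(list(inputs.keys()))                            # ['dataset', 'model']
print({k: v["default"] for k, v in params.items()})   # {'threshold': 0.5, 'bins': 10}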
@@ -264,7 +264,7 @@ def metric(func_or_id):
         {
             "run": _get_run_method(func, inputs, params),
             "required_inputs": list(inputs.keys()),
-            "default_parameters": params,
+            "default_params": {k: v["default"] for k, v in params.items()},
             "__doc__": description,
             "metadata": {
                 "task_types": tasks,
validmind/tests/model_validation/BertScore.py
@@ -2,116 +2,118 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import itertools
-from dataclasses import dataclass
-
 import evaluate
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
 
 
-@dataclass
-class BertScore(Metric):
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def BertScore(dataset, model):
     """
-    Evaluates text generation models' performance by calculating precision, recall, and F1 score based on BERT
+    Evaluates the quality of machine-generated text using BERTScore metrics and visualizes the results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics for each BERTScore metric.
+
+    **Purpose:**
+    This function is designed to assess the quality of text generated by machine learning models using BERTScore metrics.
+    BERTScore evaluates text generation models' performance by calculating precision, recall, and F1 score based on BERT
     contextual embeddings.
 
-    **Purpose**: The BERTScore metric is deployed to evaluate the competence of text generation models by focusing on
-    the similarity between the reference and the generated text. It employs the contextual embeddings from BERT models
-    to assess the similarity of the contents. This measures the extent to which a model has learned and can generate
-    contextually relevant results.
-
-    **Test Mechanism**: The true values derived from the model's test dataset and the model's predictions are employed
-    in this metric. BERTScore calculates the precision, recall, and F1 score of the model considering the contextual
-    similarity between the reference and the produced text. These scores are computed for each token in the predicted
-    sentences as compared to the reference sentences, while considering the cosine similarity with BERT embeddings. A
-    line plot depicting the score changes across row indexes is generated for each metric i.e., Precision, Recall, and
-    F1 Score.
-
-    **Signs of High Risk**:
-    - Observable downward trend in Precision, Recall, or F1 Score.
-    - Noticeable instability or fluctuation in these metrics. Lower Precision implies that predictions often
-    incorporate irrelevant contexts.
-    - Declining Recall suggests that the model frequently omits relevant contexts during predictions.
-    - Lower F1 score signals poor overall performance in both precision and recall.
-
-    **Strengths**:
-    - BERTScore efficiently detects the quality of text that requires to comprehend the context, a common requirement
-    in natural language processing tasks.
-    - This metric advances beyond the simple n-gram matching and considers the semantic similarity in the context,
-    thereby providing more meaningful evaluation results.
-    - The integrated visualization function allows tracking of the performance trends across different prediction sets.
-
-    **Limitations**:
-    - Dependence on BERT model embeddings for BERTScore implies that if the base BERT model is not suitable for a
-    specific task, it might impair the accuracy of BERTScore.
-    - Despite being good at understanding semantics, it might be incapable of capturing certain nuances in text
-    similarity that other metrics like BLEU or ROUGE could detect.
-    - Can be computationally expensive due to the utilization of BERT embeddings.
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
+    the BERTScore evaluator. For each pair of true and predicted texts, the function calculates the BERTScore metrics and
+    compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore metric (Precision, Recall,
+    and F1 Score) to visualize their distribution. Additionally, a table of descriptive statistics (mean, median, standard
+    deviation, minimum, and maximum) is compiled for each metric, providing a comprehensive summary of the model's performance.
+
+    **Signs of High Risk:**
+    - Consistently low scores across BERTScore metrics could indicate poor quality in the generated text, suggesting that the model
+    fails to capture the essential content of the reference texts.
+    - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
+    - Low recall scores may indicate that important information from the reference text is being omitted.
+    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the model's ability
+    to balance informativeness and conciseness.
+
+    **Strengths:**
+    - Provides a multifaceted evaluation of text quality through different BERTScore metrics, offering a detailed view of model performance.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
+
+    **Limitations:**
+    - BERTScore relies on the contextual embeddings from BERT models, which may not fully capture all nuances of text similarity.
+    - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
+    - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's performance and should be
+    supplemented with other metrics and qualitative analysis.
     """
 
-    name = "bert_score"
-    required_inputs = ["model", "dataset"]
-
-    def run(self):
-        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-        # Load the bert evaluation metric
-        bert = evaluate.load("bertscore")
-
-        # Compute the BLEU score
-        bert_s = bert.compute(
-            predictions=y_pred,
-            references=y_true,
-            lang="en",
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Ensure y_true and y_pred have the same length
+    if len(y_true) != len(y_pred):
+        min_length = min(len(y_true), len(y_pred))
+        y_true = y_true[:min_length]
+        y_pred = y_pred[:min_length]
+
+    # Load the BERT evaluation metric
+    bert = evaluate.load("bertscore")
+
+    # Compute the BERT score
+    bert_s = bert.compute(
+        predictions=y_pred,
+        references=y_true,
+        lang="en",
+    )
+
+    # Convert scores to a dataframe
+    metrics_df = pd.DataFrame(bert_s)
+    figures = []
+
+    # Generate histograms and bar charts for each score type
+    score_types = ["precision", "recall", "f1"]
+    score_names = ["Precision", "Recall", "F1 Score"]
+
+    for score_type, score_name in zip(score_types, score_names):
+        # Histogram
+        hist_fig = go.Figure(data=[go.Histogram(x=metrics_df[score_type])])
+        hist_fig.update_layout(
+            title=f"{score_name} Histogram",
+            xaxis_title=score_name,
+            yaxis_title="Count",
         )
+        figures.append(hist_fig)
 
-        metrics_df = pd.DataFrame(bert_s)
-        figures = []
-
-        # Visualization part
-        fig = go.Figure()
-
-        # Adding the line plots
-        fig.add_trace(
-            go.Scatter(
-                x=metrics_df.index,
-                y=metrics_df["precision"],
-                mode="lines+markers",
-                name="Precision",
-            )
-        )
-        fig.add_trace(
-            go.Scatter(
-                x=metrics_df.index,
-                y=metrics_df["recall"],
-                mode="lines+markers",
-                name="Recall",
-            )
-        )
-        fig.add_trace(
-            go.Scatter(
-                x=metrics_df.index,
-                y=metrics_df["f1"],
-                mode="lines+markers",
-                name="F1 Score",
-            )
-        )
-
-        fig.update_layout(
-            title="Bert Scores for Each Row",
+        # Bar Chart
+        bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df[score_type])])
+        bar_fig.update_layout(
+            title=f"{score_name} Bar Chart",
             xaxis_title="Row Index",
-            yaxis_title="Score",
-        )
-        figures.append(
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
-            )
+            yaxis_title=score_name,
         )
-
-        return self.cache_results(figures=figures)
+        figures.append(bar_fig)
+
+    # Calculate statistics for each score type
+    stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Score",
+            "50%": "Median Score",
+            "max": "Max Score",
+            "min": "Min Score",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(metrics_df)
+
+    # Rename metrics for clarity
+    stats_df.index = stats_df.index.map(
+        {"precision": "Precision", "recall": "Recall", "f1": "F1 Score"}
+    )
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
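
Outside the test harness, the same per-row scores and summary table can be reproduced directly with the `evaluate` library. The snippet below is a minimal sketch on toy data; it assumes the `bert_score` backend is installed and downloads a BERT checkpoint on first run.

import evaluate
import pandas as pd

# Toy reference/prediction pairs (illustrative only).
references = ["The cat sat on the mat.", "It is raining today."]
predictions = ["A cat is sitting on the mat.", "It rains today."]

# bertscore returns per-row precision/recall/f1 lists.
bert = evaluate.load("bertscore")
scores = bert.compute(predictions=predictions, references=references, lang="en")

metrics_df = pd.DataFrame({k: scores[k] for k in ("precision", "recall", "f1")})

# Same descriptive-statistics table the test builds: mean, median, extrema, std.
stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]].T
print(stats_df)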
validmind/tests/model_validation/BleuScore.py
@@ -2,77 +2,106 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import evaluate
+import pandas as pd
+import plotly.graph_objects as go
 
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks
 
 
-@dataclass
-class BleuScore(Metric):
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def BleuScore(dataset, model):
     """
-    Assesses translation quality by comparing machine-translated sentences with human-translated ones using BLEU score.
-
-    **Purpose**: The Bilingual Evaluation Understudy (BLEU) metric measures the quality of machine-translated text by
-    comparing it to human-translated text. This comparison is done at the sentence level and is designed to bring
-    machine translations closer to the quality of a professional human translation. It is commonly used in the field of
-    translation evaluation, and its purpose is to assess the accuracy of a model's output against that of a benchmark.
-
-    **Test Mechanism**: The BLEU score is implemented using the NLTK's word_tokenize function to split the text into
-    individual words. After tokenization, the evaluate library's BLEU metric calculates the BLEU score for each
-    translated sentence by comparing the model's translations (predictions) against the actual, correct translations
-    (references). The test algorithm then combines these individual scores into a single score that represents the
-    average 'distance' between the generated translations and the human translations across the entire test set.
-
-    **Signs of High Risk**:
-    - Low BLEU scores suggest high model risk. This could indicate significant discrepancies between the machine
-    translation and its human equivalent.
-    - This could be due to ineffective model learning, overfitting of training data, or inadequate handling of the
-    language's nuances.
-    - Machine biases toward a certain language style or translation mode can result in lower scores.
-
-    **Strengths**:
-    - The BLEU score's primary strength lies in its simplicity and interpretability. It offers a straightforward way to
-    assess translated text quality, and its calculations often align with human judgments.
-    - The BLEU score breaks down its evaluations at the sentence level, offering granular insights into any errors.
-    - The score consolidates the model's performance into a single, comprehensive score, making it easy to compare and
-    monitor.
-
-    **Limitations**:
-    - The BLEU score heavily favours exact matches, which can create a bias towards literal translations. Thus, it may
-    fail to fully evaluate more complex or flexible translations that shy away from a word-for-word structure.
-    - The score does not directly measure the intelligibility or grammatical correctness of the translations.
-    - It may miss errors originating from subtle nuances in language, cultural contexts, or ambiguities.
+    Evaluates the quality of machine-generated text using BLEU metrics and visualizes the results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics for BLEU scores.
+
+    **Purpose:**
+    This function is designed to assess the quality of text generated by machine learning models using the BLEU metric.
+    BLEU, which stands for Bilingual Evaluation Understudy, is a metric used to evaluate the overlap of n-grams between
+    the machine-generated text and reference texts. This evaluation is crucial for tasks such as text summarization,
+    machine translation, and text generation, where the goal is to produce text that accurately reflects the content
+    and meaning of human-crafted references.
+
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
+    the BLEU evaluator. For each pair of true and predicted texts, the function calculates the BLEU scores and compiles them
+    into a dataframe. Histograms and bar charts are generated for the BLEU scores to visualize their distribution. Additionally,
+    a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the BLEU scores,
+    providing a comprehensive summary of the model's performance.
+
+    **Signs of High Risk:**
+    - Consistently low BLEU scores could indicate poor quality in the generated text, suggesting that the model fails to capture
+    the essential content of the reference texts.
+    - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
+    - Low recall scores may indicate that important information from the reference text is being omitted.
+    - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the model's
+    ability to balance informativeness and conciseness.
+
+    **Strengths:**
+    - Provides a straightforward and widely-used evaluation of text quality through BLEU scores.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
+
+    **Limitations:**
+    - BLEU metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or grammatical quality
+    of the text.
+    - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
+    - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and should be
+    supplemented with other metrics and qualitative analysis.
     """
 
-    name = "bleu_score"
-    required_inputs = ["model", "dataset"]
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
 
-    def run(self):
-        # Load the BLEU evaluation metric
-        bleu = evaluate.load("bleu")
+    # Load the BLEU evaluation metric
+    bleu = evaluate.load("bleu")
 
+    # Calculate BLEU scores
+    score_list = []
+    for y_t, y_p in zip(y_true, y_pred):
         # Compute the BLEU score
-        bleu = bleu.compute(
-            predictions=self.inputs.dataset.y_pred(self.inputs.model),
-            references=self.inputs.dataset.y,
-        )
-        return self.cache_results(metric_value={"blue_score_metric": bleu})
-
-    def summary(self, metric_value):
-        """
-        Build one table for summarizing the bleu score results
-        """
-        summary_bleu_score = metric_value["blue_score_metric"]
-
-        table = []
-        table.append(summary_bleu_score)
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=table,
-                    metadata=ResultTableMetadata(title="Bleu score Results"),
-                ),
-            ]
-        )
+        score = bleu.compute(predictions=[y_p], references=[[y_t]])
+        score_list.append(score["bleu"])
+
+    # Convert scores to a dataframe
+    metrics_df = pd.DataFrame(score_list, columns=["BLEU Score"])
+
+    figures = []
+
+    # Histogram for BLEU Score
+    hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["BLEU Score"])])
+    hist_fig.update_layout(
+        title="BLEU Score Histogram",
+        xaxis_title="BLEU Score",
+        yaxis_title="Count",
+    )
+    figures.append(hist_fig)
+
+    # Bar Chart for BLEU Score
+    bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df["BLEU Score"])])
+    bar_fig.update_layout(
+        title="BLEU Score Bar Chart",
+        xaxis_title="Row Index",
+        yaxis_title="BLEU Score",
+    )
+    figures.append(bar_fig)
+
+    # Calculate statistics for BLEU Score
+    stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Score",
+            "50%": "Median Score",
+            "max": "Max Score",
+            "min": "Min Score",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(metrics_df)
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
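
The main computational change in this hunk is that BLEU is now computed per row rather than once over the whole corpus. A small sketch of the two call shapes with the `evaluate` library, on toy strings (illustrative only):

import evaluate

bleu = evaluate.load("bleu")

reference = "the cat is on the mat"
prediction = "the cat sat on the mat"

# Rewritten test: score each row separately, one prediction against a
# single-element list of references, keeping only the "bleu" value.
row_score = bleu.compute(predictions=[prediction], references=[[reference]])["bleu"]
print(row_score)

# Previous implementation: a single corpus-level score over all rows at once,
# roughly bleu.compute(predictions=y_pred, references=y_true).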