validmind 2.1.1__py3-none-any.whl → 2.2.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- validmind/__version__.py +1 -1
- validmind/ai.py +72 -49
- validmind/api_client.py +42 -16
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/errors.py +1 -1
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +12 -7
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +13 -7
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +99 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +560 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/metric.py +9 -24
- validmind/vm_models/test/result_wrapper.py +124 -28
- validmind/vm_models/test/threshold_test.py +10 -28
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/METADATA +5 -3
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/RECORD +103 -78
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/LICENSE +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/WHEEL +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/BertScore.py

@@ -2,116 +2,118 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- import itertools
- from dataclasses import dataclass
-
  import evaluate
  import pandas as pd
  import plotly.graph_objects as go

- from validmind …
+ from validmind import tags, tasks


- @…
- …
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def BertScore(dataset, model):
      """
-     Evaluates …
+     Evaluates the quality of machine-generated text using BERTScore metrics and visualizes the results through histograms
+     and bar charts, alongside compiling a comprehensive table of descriptive statistics for each BERTScore metric.
+
+     **Purpose:**
+     This function is designed to assess the quality of text generated by machine learning models using BERTScore metrics.
+     BERTScore evaluates text generation models' performance by calculating precision, recall, and F1 score based on BERT
      contextual embeddings.

-     ** …
-     …
-     **Limitations**:
-     - Dependence on BERT model embeddings for BERTScore implies that if the base BERT model is not suitable for a
-     specific task, it might impair the accuracy of BERTScore.
-     - Despite being good at understanding semantics, it might be incapable of capturing certain nuances in text
-     similarity that other metrics like BLEU or ROUGE could detect.
-     - Can be computationally expensive due to the utilization of BERT embeddings.
+     **Test Mechanism:**
+     The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
+     the BERTScore evaluator. For each pair of true and predicted texts, the function calculates the BERTScore metrics and
+     compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore metric (Precision, Recall,
+     and F1 Score) to visualize their distribution. Additionally, a table of descriptive statistics (mean, median, standard
+     deviation, minimum, and maximum) is compiled for each metric, providing a comprehensive summary of the model's performance.
+
+     **Signs of High Risk:**
+     - Consistently low scores across BERTScore metrics could indicate poor quality in the generated text, suggesting that the model
+     fails to capture the essential content of the reference texts.
+     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
+     - Low recall scores may indicate that important information from the reference text is being omitted.
+     - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the model's ability
+     to balance informativeness and conciseness.
+
+     **Strengths:**
+     - Provides a multifaceted evaluation of text quality through different BERTScore metrics, offering a detailed view of model performance.
+     - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
+
+     **Limitations:**
+     - BERTScore relies on the contextual embeddings from BERT models, which may not fully capture all nuances of text similarity.
+     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
+     - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's performance and should be
+     supplemented with other metrics and qualitative analysis.
      """

-     …
+     # Extract true and predicted values
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)
+
+     # Ensure y_true and y_pred have the same length
+     if len(y_true) != len(y_pred):
+         min_length = min(len(y_true), len(y_pred))
+         y_true = y_true[:min_length]
+         y_pred = y_pred[:min_length]
+
+     # Load the BERT evaluation metric
+     bert = evaluate.load("bertscore")
+
+     # Compute the BERT score
+     bert_s = bert.compute(
+         predictions=y_pred,
+         references=y_true,
+         lang="en",
+     )
+
+     # Convert scores to a dataframe
+     metrics_df = pd.DataFrame(bert_s)
+     figures = []
+
+     # Generate histograms and bar charts for each score type
+     score_types = ["precision", "recall", "f1"]
+     score_names = ["Precision", "Recall", "F1 Score"]
+
+     for score_type, score_name in zip(score_types, score_names):
+         # Histogram
+         hist_fig = go.Figure(data=[go.Histogram(x=metrics_df[score_type])])
+         hist_fig.update_layout(
+             title=f"{score_name} Histogram",
+             xaxis_title=score_name,
+             yaxis_title="Count",
          )
+         figures.append(hist_fig)

-     …
-         fig = go.Figure()
-
-         # Adding the line plots
-         fig.add_trace(
-             go.Scatter(
-                 x=metrics_df.index,
-                 y=metrics_df["precision"],
-                 mode="lines+markers",
-                 name="Precision",
-             )
-         )
-         fig.add_trace(
-             go.Scatter(
-                 x=metrics_df.index,
-                 y=metrics_df["recall"],
-                 mode="lines+markers",
-                 name="Recall",
-             )
-         )
-         fig.add_trace(
-             go.Scatter(
-                 x=metrics_df.index,
-                 y=metrics_df["f1"],
-                 mode="lines+markers",
-                 name="F1 Score",
-             )
-         )
-
-         fig.update_layout(
-             title="Bert Scores for Each Row",
+         # Bar Chart
+         bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df[score_type])])
+         bar_fig.update_layout(
+             title=f"{score_name} Bar Chart",
              xaxis_title="Row Index",
-             yaxis_title=…
-         )
-         figures.append(
-             Figure(
-                 for_object=self,
-                 key=self.key,
-                 figure=fig,
-             )
+             yaxis_title=score_name,
          )
-
-     …
+         figures.append(bar_fig)
+
+     # Calculate statistics for each score type
+     stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+     stats_df = stats_df.rename(
+         index={
+             "mean": "Mean Score",
+             "50%": "Median Score",
+             "max": "Max Score",
+             "min": "Min Score",
+             "std": "Standard Deviation",
+         }
+     ).T
+     stats_df["Count"] = len(metrics_df)
+
+     # Rename metrics for clarity
+     stats_df.index = stats_df.index.map(
+         {"precision": "Precision", "recall": "Recall", "f1": "F1 Score"}
+     )
+
+     # Create a DataFrame from all collected statistics
+     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+     return (result_df, *tuple(figures))
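The rewritten tests in this release share a new contract: a plain function decorated with `@tags`/`@tasks` that takes a `dataset` and a `model`, reads `dataset.y` and `dataset.y_pred(model)`, and returns a statistics DataFrame followed by Plotly figures. The sketch below is a hypothetical illustration of that contract only, assuming the decorators leave the function directly callable; `DemoDataset`/`DemoModel` are illustrative stand-ins (in practice the inputs come from validmind's own dataset and model wrappers, see `validmind/vm_models/dataset/dataset.py` in the file list above), and running it downloads a BERT model through the `evaluate` library.

```python
# Hypothetical sketch of the new functional-test contract; DemoDataset/DemoModel
# are illustrative stand-ins, not validmind classes.
from validmind.tests.model_validation.BertScore import BertScore  # module path per the file list above


class DemoModel:
    """Placeholder model object; BertScore only forwards it to dataset.y_pred()."""


class DemoDataset:
    """Minimal object satisfying the dataset.y / dataset.y_pred(model) interface."""

    def __init__(self, y_true, y_pred):
        self.y = y_true          # reference texts
        self._y_pred = y_pred    # generated texts

    def y_pred(self, model):
        return self._y_pred


dataset = DemoDataset(
    y_true=["the cat sat on the mat", "it is raining today"],
    y_pred=["a cat is sitting on the mat", "it rains today"],
)

# The test returns (result_df, *figures): a statistics table plus Plotly figures.
result_df, *figures = BertScore(dataset, DemoModel())
print(result_df)    # mean/median/max/min/std per BERTScore metric
figures[0].show()   # precision histogram (first figure generated in the loop)
```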
validmind/tests/model_validation/BleuScore.py

@@ -2,77 +2,106 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
-
  import evaluate
+ import pandas as pd
+ import plotly.graph_objects as go

- from validmind …
+ from validmind import tags, tasks


- @…
- …
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def BleuScore(dataset, model):
      """
-     …
+     Evaluates the quality of machine-generated text using BLEU metrics and visualizes the results through histograms
+     and bar charts, alongside compiling a comprehensive table of descriptive statistics for BLEU scores.
+
+     **Purpose:**
+     This function is designed to assess the quality of text generated by machine learning models using the BLEU metric.
+     BLEU, which stands for Bilingual Evaluation Understudy, is a metric used to evaluate the overlap of n-grams between
+     the machine-generated text and reference texts. This evaluation is crucial for tasks such as text summarization,
+     machine translation, and text generation, where the goal is to produce text that accurately reflects the content
+     and meaning of human-crafted references.
+
+     **Test Mechanism:**
+     The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
+     the BLEU evaluator. For each pair of true and predicted texts, the function calculates the BLEU scores and compiles them
+     into a dataframe. Histograms and bar charts are generated for the BLEU scores to visualize their distribution. Additionally,
+     a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the BLEU scores,
+     providing a comprehensive summary of the model's performance.
+
+     **Signs of High Risk:**
+     - Consistently low BLEU scores could indicate poor quality in the generated text, suggesting that the model fails to capture
+     the essential content of the reference texts.
+     - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
+     - Low recall scores may indicate that important information from the reference text is being omitted.
+     - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the model's
+     ability to balance informativeness and conciseness.
+
+     **Strengths:**
+     - Provides a straightforward and widely-used evaluation of text quality through BLEU scores.
+     - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+     - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
+
+     **Limitations:**
+     - BLEU metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or grammatical quality
+     of the text.
+     - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
+     - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and should be
+     supplemented with other metrics and qualitative analysis.
      """

-     …
+     # Extract true and predicted values
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)

-     …
-         bleu = evaluate.load("bleu")
+     # Load the BLEU evaluation metric
+     bleu = evaluate.load("bleu")

+     # Calculate BLEU scores
+     score_list = []
+     for y_t, y_p in zip(y_true, y_pred):
          # Compute the BLEU score
-     …
+         score = bleu.compute(predictions=[y_p], references=[[y_t]])
+         score_list.append(score["bleu"])
+
+     # Convert scores to a dataframe
+     metrics_df = pd.DataFrame(score_list, columns=["BLEU Score"])
+
+     figures = []
+
+     # Histogram for BLEU Score
+     hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["BLEU Score"])])
+     hist_fig.update_layout(
+         title="BLEU Score Histogram",
+         xaxis_title="BLEU Score",
+         yaxis_title="Count",
+     )
+     figures.append(hist_fig)
+
+     # Bar Chart for BLEU Score
+     bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df["BLEU Score"])])
+     bar_fig.update_layout(
+         title="BLEU Score Bar Chart",
+         xaxis_title="Row Index",
+         yaxis_title="BLEU Score",
+     )
+     figures.append(bar_fig)
+
+     # Calculate statistics for BLEU Score
+     stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+     stats_df = stats_df.rename(
+         index={
+             "mean": "Mean Score",
+             "50%": "Median Score",
+             "max": "Max Score",
+             "min": "Min Score",
+             "std": "Standard Deviation",
+         }
+     ).T
+     stats_df["Count"] = len(metrics_df)
+
+     # Create a DataFrame from all collected statistics
+     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+     return (result_df, *tuple(figures))
validmind/tests/model_validation/ContextualRecall.py

@@ -2,109 +2,92 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- import itertools
- from dataclasses import dataclass
-
  import nltk
  import pandas as pd
  import plotly.graph_objects as go

- from validmind …
+ from validmind import tags, tasks


- @…
- …
+ @tags("nlp", "text_data", "visualization")
+ @tasks("text_classification", "text_summarization")
+ def ContextualRecall(dataset, model):
      """
-     Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct
-     text.
-
-     **Purpose**:
-     The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to
-     generate text that appropriately reflects the given context or prompt. It measures the model's capability to
-     remember and reproduce the main context in its resulting output. This metric is critical in natural language
-     processing tasks, as the coherency and contextuality of the generated text are essential.
-
-     **Test Mechanism**:
-
-     1. **Preparation of Reference and Candidate Texts**:
-     - **Reference Texts**: Gather the reference text(s) which exemplify the expected or ideal output for a specific
-     context or prompt.
-     - **Candidate Texts**: Generate candidate text(s) from the NLG model under evaluation using the same context.
-     2. **Tokenization and Preprocessing**:
-     - Tokenize the reference and candidate texts into discernible words or tokens using libraries such as NLTK.
-     3. **Computation of Contextual Recall**:
-     - Identify the token overlap between the reference and candidate texts.
-     - The Contextual Recall score is computed by dividing the number of overlapping tokens by the total number of
-     tokens in the reference text. Scores are calculated for each test dataset instance, resulting in an array of
-     scores. These scores are then visualized using a line plot to show score variations across different rows.
-
-     **Signs of High Risk**:
-
-     - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in
-     its output, leading to incoherent or contextually misaligned text.
-     - A consistent trend of low recall scores could suggest underperformance of the model.
+     Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct text, visualizing the results through histograms and bar charts, alongside compiling a comprehensive table of descriptive statistics for contextual recall scores.

-     ** …
+     **Purpose:**
+     The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to generate text that appropriately reflects the given context or prompt. It measures the model's capability to remember and reproduce the main context in its resulting output. This metric is critical in natural language processing tasks, as the coherency and contextuality of the generated text are essential.

-     …
+     **Test Mechanism:**
+     The function starts by extracting the true and predicted values from the provided dataset and model. It then tokenizes the reference and candidate texts into discernible words or tokens using NLTK. The token overlap between the reference and candidate texts is identified, and the Contextual Recall score is computed by dividing the number of overlapping tokens by the total number of tokens in the reference text. Scores are calculated for each test dataset instance, resulting in an array of scores. These scores are visualized using a histogram and a bar chart to show score variations across different rows. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the contextual recall scores, providing a comprehensive summary of the model's performance.
+
+     **Signs of High Risk:**
+     - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in its output, leading to incoherent or contextually misaligned text.
+     - A consistent trend of low recall scores could suggest underperformance of the model.

-     ** …
+     **Strengths:**
+     - Provides a quantifiable measure of a model's adherence to the context and factual elements of the generated narrative.
+     - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of contextual recall scores.
+     - Descriptive statistics offer a concise summary of the model's performance in generating contextually relevant texts.

-     …
-     texts lack coherence or meaningful context.
+     **Limitations:**
+     - The focus on word overlap could result in high scores for texts that use many common words, even when these texts lack coherence or meaningful context.
      - This metric does not consider the order of words, which could lead to overestimated scores for scrambled outputs.
      - Models that effectively use infrequent words might be undervalued, as these words might not overlap as often.
      """

-     …
+     y_true = dataset.y
+     y_pred = dataset.y_pred(model)
+
+     score_list = []
+     for y_t, y_p in zip(y_true, y_pred):
+         # Tokenize the reference and candidate texts
+         reference_tokens = nltk.word_tokenize(y_t.lower())
+         candidate_tokens = nltk.word_tokenize(y_p.lower())
+
+         # Calculate overlapping tokens
+         overlapping_tokens = set(reference_tokens) & set(candidate_tokens)
+
+         # Compute contextual recall
+         score_list.append(len(overlapping_tokens) / len(reference_tokens))
+
+     metrics_df = pd.DataFrame(score_list, columns=["Contextual Recall"])
+     figures = []
+
+     # Histogram for Contextual Recall
+     hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["Contextual Recall"])])
+     hist_fig.update_layout(
+         title="Contextual Recall Histogram",
+         xaxis_title="Contextual Recall",
+         yaxis_title="Count",
+     )
+     figures.append(hist_fig)
+
+     # Bar Chart for Contextual Recall
+     bar_fig = go.Figure(
+         data=[go.Bar(x=metrics_df.index, y=metrics_df["Contextual Recall"])]
+     )
+     bar_fig.update_layout(
+         title="Contextual Recall Bar Chart",
+         xaxis_title="Row Index",
+         yaxis_title="Contextual Recall",
+     )
+     figures.append(bar_fig)
+
+     # Calculate statistics for Contextual Recall
+     stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+     stats_df = stats_df.rename(
+         index={
+             "mean": "Mean Score",
+             "50%": "Median Score",
+             "max": "Max Score",
+             "min": "Min Score",
+             "std": "Standard Deviation",
+         }
+     ).T
+     stats_df["Count"] = len(metrics_df)
+
+     # Create a DataFrame from all collected statistics
+     result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+     return (result_df, *tuple(figures))