validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/AspectCritique.py

@@ -11,6 +11,8 @@ from validmind import tags, tasks
 
 from .utils import get_ragas_config, get_renamed_columns
 
+LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]
+
 
 @tags("ragas", "llm", "qualitative")
 @tasks("text_summarization", "text_generation", "text_qa")
@@ -101,8 +103,8 @@ def AspectCritique(
     """
     try:
         from ragas import evaluate
-        from ragas.metrics.critique import AspectCritique as _AspectCritique
-        from ragas.metrics.critique import (
+        from ragas.metrics import AspectCritic
+        from ragas.metrics._aspect_critic import (
             coherence,
             conciseness,
             correctness,
@@ -112,7 +114,7 @@ def AspectCritique(
     except ImportError:
         raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
 
-    aspect_map = {
+    built_in_aspects = {
         "coherence": coherence,
         "conciseness": conciseness,
         "correctness": correctness,
@@ -134,21 +136,25 @@ def AspectCritique(
 
     df = get_renamed_columns(dataset._df, required_columns)
 
-    built_in_aspects = [aspect_map[aspect] for aspect in aspects]
     custom_aspects = (
         [
-            _AspectCritique(name=name, definition=description)
+            AspectCritic(name=name, definition=description)
             for name, description in additional_aspects
         ]
         if additional_aspects
         else []
     )
-    all_aspects = [*built_in_aspects, *custom_aspects]
+    all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects
 
     result_df = evaluate(
         Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
     ).to_pandas()
 
+    # reverse the score for aspects where lower is better
+    for aspect in LOWER_IS_BETTER_ASPECTS:
+        if aspect in result_df.columns:
+            result_df[aspect] = 1 - result_df[aspect]
+
     df_melted = result_df.melt(
         id_vars=["question", "answer", "contexts"],
         value_vars=[aspect.name for aspect in all_aspects],
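
The hunk above tracks the ragas rename of `AspectCritique` to `AspectCritic` and adds a post-processing step that flips scores for aspects where a lower raw value is better. A minimal, illustrative sketch of that flip (the column names mirror the diff; the data is made up):

```python
import pandas as pd

LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]

# Illustrative binary verdicts as a critique metric might return them
# (1 = the aspect criterion is met, e.g. the answer is harmful).
result_df = pd.DataFrame(
    {
        "coherence": [1, 1, 0],
        "harmfulness": [0, 1, 0],
    }
)

# Flip lower-is-better aspects so that, like the others, higher now means better.
for aspect in LOWER_IS_BETTER_ASPECTS:
    if aspect in result_df.columns:
        result_df[aspect] = 1 - result_df[aspect]

print(result_df)  # harmfulness becomes [1, 0, 1]
```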
validmind/tests/model_validation/ragas/ContextEntityRecall.py

@@ -47,6 +47,7 @@ def ContextEntityRecall(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `contexts` (List[str]): A list of text contexts which will be evaluated to make
       sure if they contain the entities present in the ground truth.
     - `ground_truth` (str): The ground truth text from which the entities will be
@@ -113,13 +114,13 @@ def ContextEntityRecall(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                [
-                    "contexts",
-                    "ground_truth",
-                    "context_entity_recall",
-                ]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     [
+            #         "contexts",
+            #         "ground_truth",
+            #         "context_entity_recall",
+            #     ]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_entity_recall"].mean(),
@@ -127,7 +128,7 @@ def ContextEntityRecall(
                     "Max Score": result_df["context_entity_recall"].max(),
                     "Min Score": result_df["context_entity_recall"].min(),
                     "Standard Deviation": result_df["context_entity_recall"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextPrecision.py

@@ -40,6 +40,7 @@ def ContextPrecision(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): A list of text contexts which are retrieved and which
       will be evaluated to make sure they contain relevant info in the correct order.
@@ -107,9 +108,9 @@ def ContextPrecision(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["question", "contexts", "ground_truth", "context_precision"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "ground_truth", "context_precision"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_precision"].mean(),
@@ -117,7 +118,7 @@ def ContextPrecision(
                     "Max Score": result_df["context_precision"].max(),
                     "Min Score": result_df["context_precision"].min(),
                     "Standard Deviation": result_df["context_precision"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextRecall.py

@@ -40,6 +40,7 @@ def ContextRecall(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): A list of text contexts which are retrieved and which
       will be evaluated to make sure they contain all items in the ground truth.
@@ -107,9 +108,9 @@ def ContextRecall(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["question", "contexts", "ground_truth", "context_recall"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "ground_truth", "context_recall"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_recall"].mean(),
@@ -117,7 +118,7 @@ def ContextRecall(
                     "Max Score": result_df["context_recall"].max(),
                     "Min Score": result_df["context_recall"].min(),
                     "Standard Deviation": result_df["context_recall"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },
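
The ragas metric tests above share one pattern: the per-row scores table is commented out so only aggregate statistics are returned, and `len(result_df)` becomes the equivalent `result_df.shape[0]`. A self-contained sketch of that aggregation, with an illustrative score column:

```python
import pandas as pd

# Illustrative per-row scores; in the tests this comes from ragas' evaluate().to_pandas()
result_df = pd.DataFrame({"context_recall": [0.8, 0.5, 1.0, 0.9]})

aggregate_scores = [
    {
        "Mean Score": result_df["context_recall"].mean(),
        "Median Score": result_df["context_recall"].median(),
        "Max Score": result_df["context_recall"].max(),
        "Min Score": result_df["context_recall"].min(),
        "Standard Deviation": result_df["context_recall"].std(),
        # shape[0] counts rows, the same value len(result_df) returned before the change
        "Count": result_df.shape[0],
    }
]
print(aggregate_scores)
```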
validmind/tests/model_validation/ragas/ContextUtilization.py (new file)

@@ -0,0 +1,155 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+
+from .utils import get_ragas_config, get_renamed_columns
+
+
+@tags("ragas", "llm", "retrieval_performance")
+@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
+def ContextUtilization(
+    dataset,
+    question_column: str = "question",
+    contexts_column: str = "contexts",
+    answer_column: str = "answer",
+):  # noqa: B950
+    """
+    Assesses how effectively relevant context chunks are utilized in generating answers by evaluating their ranking
+    within the provided contexts.
+
+    ### Purpose
+
+    The Context Utilization test evaluates whether all of the answer-relevant items present in the contexts are ranked
+    higher within the provided retrieval results. This metric is essential for assessing the performance of models,
+    especially those involved in tasks such as text QA, text generation, text summarization, and text classification.
+
+    ### Test Mechanism
+
+    The test calculates Context Utilization using the formula:
+
+    $$
+    \\text{Context Utilization@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
+    $$
+    $$
+    \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
+    $$
+
+    Where $K$ is the total number of chunks in `contexts` and $v_k \\in \\{0, 1\\}$ is the relevance indicator at rank $k$.
+
+
+    This test uses columns for questions, contexts, and answers from the dataset and computes context utilization
+    scores, generating a histogram and box plot for visualization.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `question` (str): The text query that was input into the model.
+    - `contexts` (List[str]): A list of text contexts which are retrieved and which will be evaluated to
+      make sure they contain relevant info in the correct order.
+    - `answer` (str): The llm-generated response for the input `question`.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `contexts_column`
+    and `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "question_column": "question",
+        "contexts_column": "context_info"
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda x: [x[pred_col]["context_message"]],
+        "ground_truth_column": "my_ground_truth_col",
+    }
+    ```
+
+    ### Signs of High Risk
+
+    - Very low mean or median context utilization scores, indicating poor usage of retrieved contexts.
+    - High standard deviation, suggesting inconsistent model performance.
+    - Low or minimal max scores, pointing to the model's failure to rank relevant contexts at top positions.
+
+    ### Strengths
+
+    - Quantifies the rank of relevant context chunks in generating responses.
+    - Provides clear visualizations through histograms and box plots for ease of interpretation.
+    - Adapts to different dataset schema by allowing configurable column names.
+
+    ### Limitations
+
+    - Assumes the relevance of context chunks is binary and may not capture nuances of partial relevance.
+    - Requires proper context retrieval to be effective; irrelevant context chunks can skew the results.
+    - Dependent on large sample sizes to provide stable and reliable estimates of utilization performance.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.metrics import context_utilization
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "contexts": contexts_column,
+        "answer": answer_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[context_utilization], **get_ragas_config()
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["context_utilization"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["context_utilization"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "answer", "context_utilization"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["context_utilization"].mean(),
+                    "Median Score": result_df["context_utilization"].median(),
+                    "Max Score": result_df["context_utilization"].max(),
+                    "Min Score": result_df["context_utilization"].min(),
+                    "Standard Deviation": result_df["context_utilization"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
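
As a worked illustration of the Context Utilization@K formula in the docstring above (this is not the ragas implementation, only the arithmetic), computed from a binary relevance vector over the ranked context chunks:

```python
def context_utilization_at_k(relevance):
    """Compute Context Utilization@K from 0/1 relevance indicators, ordered by
    the rank of the retrieved context chunks."""
    total_relevant = sum(relevance)
    if total_relevant == 0:
        return 0.0
    score = 0.0
    true_positives = 0
    for k, v_k in enumerate(relevance, start=1):
        true_positives += v_k
        precision_at_k = true_positives / k  # TP@k / (TP@k + FP@k)
        score += precision_at_k * v_k
    return score / total_relevant


# Relevant chunks ranked 1st and 3rd out of 4 retrieved contexts:
print(context_utilization_at_k([1, 0, 1, 0]))  # (1/1 + 2/3) / 2 ≈ 0.83
```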
validmind/tests/model_validation/ragas/Faithfulness.py

@@ -41,6 +41,7 @@ def Faithfulness(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `contexts` (List[str]): A list of text contexts which are retrieved to generate
       the answer.
     - `answer` (str): The response generated by the model which will be evaluated for
@@ -105,9 +106,9 @@ def Faithfulness(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["contexts", "answer", "faithfulness"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "faithfulness"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["faithfulness"].mean(),
@@ -115,7 +116,7 @@ def Faithfulness(
                     "Max Score": result_df["faithfulness"].max(),
                     "Min Score": result_df["faithfulness"].min(),
                     "Standard Deviation": result_df["faithfulness"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
            ],
         },
validmind/tests/model_validation/ragas/NoiseSensitivity.py (new file)

@@ -0,0 +1,152 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+
+from validmind import tags, tasks
+
+from .utils import get_ragas_config, get_renamed_columns
+
+
+@tags("ragas", "llm", "rag_performance")
+@tasks("text_qa", "text_generation", "text_summarization")
+def NoiseSensitivity(
+    dataset,
+    answer_column="answer",
+    contexts_column="contexts",
+    ground_truth_column="ground_truth",
+):
+    """
+    Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
+    generates incorrect responses.
+
+    ### Purpose
+
+    The Noise Sensitivity test aims to measure how sensitive an LLM is to irrelevant or noisy information within the
+    contextual data used to generate its responses. A lower noise sensitivity score suggests better model robustness in
+    generating accurate answers from given contexts.
+
+    ### Test Mechanism
+
+    This test evaluates the model's answers by comparing the claims made in the generated response against the ground
+    truth and the retrieved context. The noise sensitivity score is calculated as:
+
+    $$
+    \\text{noise sensitivity} = {|\\text{Number of incorrect claims in answer}| \\over |\\text{Number of total claims in answer}|}
+    $$
+
+    The formula computes the fraction of incorrect claims to the total claims in the answer, using a dataset where
+    'answer', 'context', and 'ground_truth' columns are specified.
+
+    #### Configuring Columns
+
+    This metric requires the following columns in your dataset:
+
+    - `contexts` (List[str]): A list of text contexts which are retrieved to generate
+      the answer.
+    - `answer` (str): The response generated by the model
+    - `ground_truth` (str): The "correct" answer to the question
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `contexts_column` and `answer_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    {
+        "contexts_column": "context_info"
+        "answer_column": "my_answer_col",
+    }
+    ```
+
+    If the data is stored as a dictionary in another column, specify the column and key
+    like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": f"{pred_col}.contexts",
+        "answer_column": f"{pred_col}.answer",
+    }
+    ```
+
+    For more complex situations, you can use a function to extract the data:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "contexts_column": lambda row: [row[pred_col]["context_message"]],
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+    }
+
+    ### Signs of High Risk
+
+    - High noise sensitivity scores across multiple samples.
+    - Significant deviation between mean and median noise sensitivity scores.
+    - High standard deviation indicating inconsistency in the model's performance.
+
+    ### Strengths
+
+    - Provides a quantitative measure of how well the LLM handles noisy or irrelevant context.
+    - Easy integration and configuration using column parameters.
+    - Utilizes both histogram and box plot visualizations to analyze score distribution.
+
+    ### Limitations
+
+    - Requires accurate ground truth that aligns with the generated answers.
+    - Assumes the context provided is sufficiently granular to assess noise sensitivity.
+    - Primarily applicable to tasks like text QA, text generation, and text summarization where contextual relevance is
+      critical.
+    """
+    try:
+        from ragas import evaluate
+        from ragas.metrics import noise_sensitivity_relevant
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "answer": answer_column,
+        "contexts": contexts_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset._df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df),
+        metrics=[noise_sensitivity_relevant],
+        **get_ragas_config(),
+    ).to_pandas()
+
+    fig_histogram = px.histogram(
+        x=result_df["noise_sensitivity_relevant"].to_list(), nbins=10
+    )
+    fig_box = px.box(x=result_df["noise_sensitivity_relevant"].to_list())
+
+    return (
+        {
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "ground_truth", "noise_sensitivity_relevant"]
+            # ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["noise_sensitivity_relevant"].mean(),
+                    "Median Score": result_df["noise_sensitivity_relevant"].median(),
+                    "Max Score": result_df["noise_sensitivity_relevant"].max(),
+                    "Min Score": result_df["noise_sensitivity_relevant"].min(),
+                    "Standard Deviation": result_df["noise_sensitivity_relevant"].std(),
+                    "Count": result_df.shape[0],
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
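
As a quick illustration of the noise sensitivity formula above (the real ragas metric uses an LLM to extract and verify claims; this only shows the arithmetic on toy claim annotations):

```python
# Toy claim-level annotations: for each generated answer, a list of booleans
# marking whether each claim is incorrect (True) or supported (False).
answers_claims = [
    [False, False, True],          # 1 of 3 claims incorrect
    [False, False, False, False],  # fully supported answer
    [True, True, False],           # 2 of 3 claims incorrect
]


def noise_sensitivity(claim_flags):
    """noise sensitivity = incorrect claims / total claims in the answer."""
    return sum(claim_flags) / len(claim_flags) if claim_flags else 0.0


scores = [noise_sensitivity(flags) for flags in answers_claims]
print(scores)                     # [0.333..., 0.0, 0.666...]
print(sum(scores) / len(scores))  # mean score reported by the test, ≈ 0.33
```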
validmind/tests/model_validation/ragas/utils.py

@@ -5,11 +5,17 @@
 import os
 
 from validmind.ai.utils import get_client_and_model
+from validmind.client_config import client_config
 
 EMBEDDINGS_MODEL = "text-embedding-3-small"
 
 
 def get_ragas_config():
+    if not client_config.can_generate_llm_test_descriptions():
+        raise ValueError(
+            "LLM based descriptions are not enabled in the current configuration."
+        )
+
     # import here since its an optional dependency
     try:
         from langchain_openai import ChatOpenAI, OpenAIEmbeddings
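
The new guard makes ragas-backed tests fail fast when the connected environment disallows LLM calls. A minimal reproduction of the gating pattern with a stand-in config object (the real check lives on `validmind.client_config.client_config`):

```python
class _StubClientConfig:
    """Stand-in for validmind.client_config.client_config, for illustration only."""

    def __init__(self, llm_enabled: bool):
        self._llm_enabled = llm_enabled

    def can_generate_llm_test_descriptions(self) -> bool:
        return self._llm_enabled


def get_ragas_config(client_config=_StubClientConfig(llm_enabled=False)):
    # Same guard as in the diff: fail fast before any LLM client is constructed.
    if not client_config.can_generate_llm_test_descriptions():
        raise ValueError(
            "LLM based descriptions are not enabled in the current configuration."
        )
    # ...the real function goes on to build the ChatOpenAI / OpenAIEmbeddings clients.
    return {}


try:
    get_ragas_config()
except ValueError as exc:
    print(exc)
```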
validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py

@@ -15,29 +15,36 @@ class AdjustedMutualInformation(ClusterPerformance):
     Evaluates clustering model performance by measuring mutual information between true and predicted labels, adjusting
     for chance.
 
-    **1. Purpose**: The purpose of this metric (Adjusted Mutual Information) is to evaluate the performance of a
-    machine learning model, more specifically, a clustering model. It measures the mutual information between the true
-    labels and the ones predicted by the model, adjusting for chance.
+    ### Purpose
 
-    **2. Test Mechanism**: The Adjusted Mutual Information (AMI) uses sklearn's `adjusted_mutual_info_score` function.
-    This function calculates the mutual information between the true labels and the ones predicted while correcting for
-    the chance correlation expected due to random label assignments. This test requires the model, the training
-    dataset, and the test dataset as inputs.
+    The purpose of this metric (Adjusted Mutual Information) is to evaluate the performance of a machine learning
+    model, more specifically, a clustering model. It measures the mutual information between the true labels and the
+    ones predicted by the model, adjusting for chance.
+
+    ### Test Mechanism
+
+    The Adjusted Mutual Information (AMI) uses sklearn's `adjusted_mutual_info_score` function. This function
+    calculates the mutual information between the true labels and the ones predicted while correcting for the chance
+    correlation expected due to random label assignments. This test requires the model, the training dataset, and the
+    test dataset as inputs.
+
+    ### Signs of High Risk
 
-    **3. Signs of High Risk**:
     - Low Adjusted Mutual Information Score: This score ranges between 0 and 1. A low score (closer to 0) can indicate
     poor model performance as the predicted labels do not align well with the true labels.
-    - In case of high dimensional data, if the algorithm shows high scores, this could also be a potential risk as AMI
+    - In case of high-dimensional data, if the algorithm shows high scores, this could also be a potential risk as AMI
     may not perform reliably.
 
-    **4. Strengths**:
+    ### Strengths
+
     - The AMI metric takes into account the randomness of the predicted labels, which makes it more robust than the
     simple Mutual Information.
     - The scale of AMI is not dependent on the sizes of the clustering, allowing for comparability between different
     datasets or models.
     - Good for comparing the output of clustering algorithms where the number of clusters is not known a priori.
 
-    **5. Limitations**:
+    ### Limitations
+
     - Adjusted Mutual Information does not take into account the continuous nature of some data. As a result, it may
     not be the best choice for regression or other continuous types of tasks.
     - AMI has the drawback of being biased towards clusterings with a higher number of clusters.
@@ -47,7 +54,7 @@ class AdjustedMutualInformation(ClusterPerformance):
     """
 
     name = "adjusted_mutual_information"
-    required_inputs = ["model", "datasets"]
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",
validmind/tests/model_validation/sklearn/AdjustedRandIndex.py

@@ -15,38 +15,43 @@ class AdjustedRandIndex(ClusterPerformance):
     Measures the similarity between two data clusters using the Adjusted Rand Index (ARI) metric in clustering machine
     learning models.
 
-    **1. Purpose:**
+    ### Purpose
+
     The Adjusted Rand Index (ARI) metric is intended to measure the similarity between two data clusters. This metric
-    is specifically being used for clustering machine learning models to validly quantify how well the model is
-    clustering and producing data groups. It involves comparing the model's produced clusters against the actual (true)
-    clusters found in the dataset.
+    is specifically used for clustering machine learning models to quantify how well the model is clustering and
+    producing data groups. It involves comparing the model's produced clusters against the actual (true) clusters found
+    in the dataset.
+
+    ### Test Mechanism
+
+    The Adjusted Rand Index (ARI) is calculated using the `adjusted_rand_score` method from the `sklearn.metrics`
+    module in Python. The test requires inputs including the model itself and the model's training and test datasets.
+    The model's computed clusters and the true clusters are compared, and the similarities are measured to compute the
+    ARI.
 
-    **2. Test Mechanism:**
-    The Adjusted Rand Index (ARI) is calculated by using the `adjusted_rand_score` method from the sklearn metrics in
-    Python. The test requires inputs including the model itself and the model's training and test datasets. The model's
-    computed clusters and the true clusters are compared, and the similarities are measured to compute the ARI.
+    ### Signs of High Risk
 
-    **3. Signs of High Risk:**
-    - If the ARI is close to zero, it signifies that the model's cluster assignments are random and don't match the
+    - If the ARI is close to zero, it signifies that the model's cluster assignments are random and do not match the
     actual dataset clusters, indicating a high risk.
     - An ARI of less than zero indicates that the model's clustering performance is worse than random.
 
-    **4. Strengths:**
-    - ARI is normalized and it hence gives a consistent metric between -1 and +1, irrespective of raw cluster sizes or
+    ### Strengths
+
+    - ARI is normalized and provides a consistent metric between -1 and +1, irrespective of raw cluster sizes or
     dataset size variations.
-    - It doesn’t require a ground truth for computation which makes it ideal for unsupervised learning model
-    evaluations.
+    - It does not require a ground truth for computation, making it ideal for unsupervised learning model evaluations.
     - It penalizes for false positives and false negatives, providing a robust measure of clustering quality.
 
-    **5. Limitations:**
+    ### Limitations
+
     - In real-world situations, true clustering is often unknown, which can hinder the practical application of the ARI.
     - The ARI requires all individual data instances to be independent, which may not always hold true.
-    - It may be difficult to interpret the implications of an ARI score without a context or a benchmark, as it is
+    - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
     heavily dependent on the characteristics of the dataset used.
     """
 
     name = "adjusted_rand_index"
-    required_inputs = ["model", "datasets"]
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",