PyPI - validmind - Versions diffs - 2.7.12__py3-none-any.whl → 2.8.12__py3-none-any.whl - Mend

validmind 2.7.12__py3-none-any.whl → 2.8.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (163) hide show

validmind/tests/model_validation/embeddings/utils.py CHANGED Viewed

@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 import numpy as np
-import pandas as pd
 import plotly.express as px
 from sklearn.metrics.pairwise import cosine_similarity
@@ -18,14 +17,12 @@ def create_stability_analysis_result(
         original_embeddings, perturbed_embeddings
     ).diagonal()
-    # create a raw dataframe of the original, perturbed and similarity
-    raw_data = pd.DataFrame(
-        {
-            "original": original_embeddings,
-            "perturbed": perturbed_embeddings,
-            "similarity": similarities,
-        }
-    )
+    # Store raw data in a dictionary
+    raw_data = {
+        "original_embeddings": original_embeddings,
+        "perturbed_embeddings": perturbed_embeddings,
+        "similarities": similarities,
+    }
     mean = np.mean(similarities)
     passed = mean > mean_similarity_threshold

validmind/tests/model_validation/ragas/AnswerCorrectness.py CHANGED Viewed

@@ -144,5 +144,5 @@ def AnswerCorrectness(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/AspectCritic.py CHANGED Viewed

@@ -195,5 +195,8 @@ def AspectCritic(
             ]
         },
         fig,
-        RawData(evaluation_results=result_df),
+        RawData(
+            evaluation_results=result_df,
+            dataset=dataset.input_id,
+        ),
     )

validmind/tests/model_validation/ragas/ContextEntityRecall.py CHANGED Viewed

@@ -143,5 +143,5 @@ def ContextEntityRecall(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/ContextPrecision.py CHANGED Viewed

@@ -135,5 +135,5 @@ def ContextPrecision(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py CHANGED Viewed

@@ -130,5 +130,5 @@ def ContextPrecisionWithoutReference(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/ContextRecall.py CHANGED Viewed

@@ -135,5 +135,5 @@ def ContextRecall(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/Faithfulness.py CHANGED Viewed

@@ -140,5 +140,5 @@ def Faithfulness(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/NoiseSensitivity.py CHANGED Viewed

@@ -179,5 +179,5 @@ def NoiseSensitivity(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/ResponseRelevancy.py CHANGED Viewed

@@ -154,5 +154,5 @@ def ResponseRelevancy(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/SemanticSimilarity.py CHANGED Viewed

@@ -133,5 +133,5 @@ def SemanticSimilarity(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )

validmind/tests/model_validation/ragas/utils.py CHANGED Viewed

@@ -4,24 +4,25 @@
 import os
-from validmind.ai.utils import get_client_and_model
-from validmind.client_config import client_config
+from validmind.ai.utils import get_client_and_model, is_configured
 EMBEDDINGS_MODEL = "text-embedding-3-small"
 def get_ragas_config():
-    if not client_config.can_generate_llm_test_descriptions():
-        raise ValueError(
-            "LLM based descriptions are not enabled in the current configuration."
-        )
     # import here since its an optional dependency
     try:
         from langchain_openai import ChatOpenAI, OpenAIEmbeddings
     except ImportError:
         raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+    if not is_configured():
+        raise ValueError(
+            "LLM is not configured. Please set an `OPENAI_API_KEY` environment variable "
+            "or ensure that you are connected to the ValidMind API and ValidMind AI is "
+            "enabled for your account."
+        )
     client, model = get_client_and_model()
     os.environ["OPENAI_API_BASE"] = str(client.base_url)

validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py CHANGED Viewed

@@ -4,7 +4,7 @@
 from sklearn.metrics import adjusted_mutual_info_score
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -52,11 +52,11 @@ def AdjustedMutualInformation(model: VMModel, dataset: VMDataset):
     - The interpretability of the score can be complex as it depends on the understanding of information theory
     concepts.
     """
-    return [
-        {
-            "Adjusted Mutual Information": adjusted_mutual_info_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
-            )
-        }
-    ]
+    ami_score = adjusted_mutual_info_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+    return [{"Adjusted Mutual Information": ami_score}], RawData(
+        ami_score=ami_score, model=model.input_id, dataset=dataset.input_id
+    )

validmind/tests/model_validation/sklearn/AdjustedRandIndex.py CHANGED Viewed

@@ -4,7 +4,7 @@
 from sklearn.metrics import adjusted_rand_score
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -49,11 +49,11 @@ def AdjustedRandIndex(model: VMModel, dataset: VMDataset):
     - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
     heavily dependent on the characteristics of the dataset used.
     """
-    return [
-        {
-            "Adjusted Rand Index": adjusted_rand_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
-            )
-        }
-    ]
+    ari = adjusted_rand_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+    return [{"Adjusted Rand Index": ari}], RawData(
+        ari_score=ari, model=model.input_id, dataset=dataset.input_id
+    )

validmind/tests/model_validation/sklearn/CalibrationCurve.py CHANGED Viewed

@@ -72,7 +72,10 @@ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):
     # Create DataFrame for raw data
     raw_data = RawData(
-        mean_predicted_probability=prob_pred, observed_frequency=prob_true
+        mean_predicted_probability=prob_pred,
+        observed_frequency=prob_true,
+        model=model.input_id,
+        dataset=dataset.input_id,
     )
     # Create Plotly figure
@@ -114,4 +117,4 @@ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):
         template="plotly_white",
     )
-    return raw_data, fig
+    return fig, raw_data

validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py CHANGED Viewed

@@ -8,7 +8,7 @@ import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -255,4 +255,17 @@ def ClassifierThresholdOptimization(
     # Create results table and sort by threshold descending
     table = pd.DataFrame(results).sort_values("threshold", ascending=False)
-    return fig, table
+    return (
+        fig,
+        table,
+        RawData(
+            fpr=fpr,
+            tpr=tpr,
+            precision=precision,
+            recall=recall,
+            thresholds_roc=thresholds_roc,
+            thresholds_pr=thresholds_pr,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
+    )

validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py CHANGED Viewed

@@ -84,4 +84,8 @@ def ClusterCosineSimilarity(model: VMModel, dataset: VMDataset):
     if not table:
         raise SkipTestError("No clusters found")
-    return table, RawData(cluster_centroids=cluster_centroids)
+    return table, RawData(
+        cluster_centroids=cluster_centroids,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )

validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py CHANGED Viewed

@@ -11,7 +11,7 @@ from sklearn.metrics import (
     v_measure_score,
 )
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
 HOMOGENEITY = """
@@ -115,53 +115,63 @@ def ClusterPerformanceMetrics(model: VMModel, dataset: VMDataset):
     - Does not consider aspects like computational efficiency of the model or its capability to handle high dimensional
     data.
     """
-    return [
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    metrics = [
         {
             "Metric": "Homogeneity Score",
             "Description": HOMOGENEITY,
             "Value": homogeneity_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Completeness Score",
             "Description": COMPLETENESS,
             "Value": completeness_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "V Measure",
             "Description": V_MEASURE,
             "Value": v_measure_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Adjusted Rand Index",
             "Description": ADJUSTED_RAND_INDEX,
             "Value": adjusted_rand_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Adjusted Mutual Information",
             "Description": ADJUSTED_MUTUAL_INFORMATION,
             "Value": adjusted_mutual_info_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Fowlkes-Mallows score",
             "Description": FOULKES_MALLOWS_SCORE,
             "Value": fowlkes_mallows_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
     ]
+    return metrics, RawData(
+        true_labels=y_true,
+        predicted_labels=y_pred,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )

validmind/tests/model_validation/sklearn/CompletenessScore.py CHANGED Viewed

@@ -4,7 +4,7 @@
 from sklearn.metrics import completeness_score
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -47,11 +47,10 @@ def CompletenessScore(model: VMModel, dataset: VMDataset):
     - The Completeness Score only applies to clustering models; it cannot be used for other types of machine learning
     models.
     """
-    return [
-        {
-            "Completeness Score": completeness_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
-            )
-        }
-    ]
+    score = completeness_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+    return [{"Completeness Score": score}], RawData(
+        score=score, model=model.input_id, dataset=dataset.input_id
+    )

validmind/tests/model_validation/sklearn/ConfusionMatrix.py CHANGED Viewed

@@ -19,7 +19,11 @@ from validmind.vm_models import VMDataset, VMModel
     "visualization",
 )
 @tasks("classification", "text_classification")
-def ConfusionMatrix(dataset: VMDataset, model: VMModel):
+def ConfusionMatrix(
+    dataset: VMDataset,
+    model: VMModel,
+    threshold: float = 0.5,
+):
     """
     Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix
     heatmap.
@@ -66,7 +70,17 @@ def ConfusionMatrix(dataset: VMDataset, model: VMModel):
     - Risks of misinterpretation exist because the matrix doesn't directly provide precision, recall, or F1-score data.
     These metrics have to be computed separately.
     """
-    y_pred = dataset.y_pred(model)
+    # Get predictions using threshold for binary classification if possible
+    if hasattr(model.model, "predict_proba"):
+        y_prob = dataset.y_prob(model)
+        # Handle both 1D and 2D probability arrays
+        if y_prob.ndim == 2:
+            y_pred = (y_prob[:, 1] > threshold).astype(int)
+        else:
+            y_pred = (y_prob > threshold).astype(int)
+    else:
+        y_pred = dataset.y_pred(model)
     y_true = dataset.y.astype(y_pred.dtype)
     labels = np.unique(y_true)
@@ -119,4 +133,9 @@ def ConfusionMatrix(dataset: VMDataset, model: VMModel):
         font=dict(size=14),
     )
-    return fig, RawData(confusion_matrix=cm)
+    return fig, RawData(
+        confusion_matrix=cm,
+        threshold=threshold,
+        dataset=dataset.input_id,
+        model=model.input_id,
+    )

validmind/tests/model_validation/sklearn/FeatureImportance.py CHANGED Viewed

@@ -5,7 +5,7 @@
 import pandas as pd
 from sklearn.inspection import permutation_importance
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -78,4 +78,8 @@ def FeatureImportance(dataset: VMDataset, model: VMModel, num_features: int = 3)
         else:
             result[f"Feature {i + 1}"] = None
-    return pd.DataFrame([result])
+    return pd.DataFrame([result]), RawData(
+        permutation_importance=pfi_values,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )

validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py CHANGED Viewed

@@ -4,7 +4,7 @@
 from sklearn import metrics
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -52,11 +52,14 @@ def FowlkesMallowsScore(dataset: VMDataset, model: VMModel):
     - It does not handle mismatching numbers of clusters between the true and predicted labels. As such, it may return
     misleading results if the predicted labels suggest a different number of clusters than what is in the true labels.
     """
-    return [
-        {
-            "Fowlkes-Mallows score": metrics.fowlkes_mallows_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
-            )
-        }
-    ]
+    fowlkes_mallows_score = metrics.fowlkes_mallows_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+    return [{"Fowlkes-Mallows score": fowlkes_mallows_score}], RawData(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )

validmind/tests/model_validation/sklearn/HomogeneityScore.py CHANGED Viewed

@@ -4,7 +4,7 @@
 from sklearn import metrics
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -50,11 +50,16 @@ def HomogeneityScore(dataset: VMDataset, model: VMModel):
     - The score does not address the actual number of clusters formed, or the evenness of cluster sizes. It only checks
     the homogeneity within the given clusters created by the model.
     """
-    return [
-        {
-            "Homogeneity Score": metrics.homogeneity_score(
-                labels_true=dataset.y,
-                labels_pred=dataset.y_pred(model),
-            )
-        }
-    ]
+    homogeneity_score = metrics.homogeneity_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+    raw_data = RawData(
+        y_true=dataset.y,
+        y_pred=dataset.y_pred(model),
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
+    return ([{"Homogeneity Score": homogeneity_score}], raw_data)

validmind/tests/model_validation/sklearn/HyperParametersTuning.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import Dict, List, Union
 from sklearn.metrics import make_scorer, recall_score
 from sklearn.model_selection import GridSearchCV
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -162,4 +162,6 @@ def HyperParametersTuning(
             results.append(row_result)
-    return results
+    return results, RawData(
+        model=model.input_id, dataset=dataset.input_id, param_grid=param_grid
+    )

validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py CHANGED Viewed

@@ -124,4 +124,9 @@ def KMeansClustersOptimization(
     fig.update_layout(showlegend=False)
-    return fig, RawData(distortions=distortions, silhouette_avg=silhouette_avg)
+    return fig, RawData(
+        distortions=distortions,
+        silhouette_avg=silhouette_avg,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )

validmind/tests/model_validation/sklearn/MinimumAccuracy.py CHANGED Viewed

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 from sklearn.metrics import accuracy_score
+from validmind import RawData
 from validmind.tests import tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -50,10 +51,14 @@ def MinimumAccuracy(dataset: VMDataset, model: VMModel, min_threshold: float = 0
     """
     accuracy = accuracy_score(dataset.y, dataset.y_pred(model))
-    return [
-        {
-            "Score": accuracy,
-            "Threshold": min_threshold,
-            "Pass/Fail": "Pass" if accuracy > min_threshold else "Fail",
-        }
-    ], accuracy > min_threshold
+    return (
+        [
+            {
+                "Score": accuracy,
+                "Threshold": min_threshold,
+                "Pass/Fail": "Pass" if accuracy > min_threshold else "Fail",
+            }
+        ],
+        accuracy > min_threshold,
+        RawData(model=model.input_id, dataset=dataset.input_id),
+    )

validmind/tests/model_validation/sklearn/MinimumF1Score.py CHANGED Viewed

@@ -5,6 +5,7 @@
 import numpy as np
 from sklearn.metrics import f1_score
+from validmind import RawData
 from validmind.tests import tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -58,10 +59,14 @@ def MinimumF1Score(dataset: VMDataset, model: VMModel, min_threshold: float = 0.
     else:
         score = f1_score(dataset.y, dataset.y_pred(model))
-    return [
-        {
-            "Score": score,
-            "Threshold": min_threshold,
-            "Pass/Fail": "Pass" if score > min_threshold else "Fail",
-        }
-    ], score > min_threshold
+    return (
+        [
+            {
+                "Score": score,
+                "Threshold": min_threshold,
+                "Pass/Fail": "Pass" if score > min_threshold else "Fail",
+            }
+        ],
+        score > min_threshold,
+        RawData(score=score, model=model.input_id, dataset=dataset.input_id),
+    )

validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py CHANGED Viewed

@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.metrics import roc_auc_score
 from sklearn.preprocessing import LabelBinarizer
-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel
@@ -62,12 +62,12 @@ def MinimumROCAUCScore(dataset: VMDataset, model: VMModel, min_threshold: float
         lb = LabelBinarizer()
         lb.fit(y_true)
-        y_true_binarized = lb.transform(y_true)
-        y_score_binarized = lb.transform(dataset.y_pred(model))
+        y_true_binary = lb.transform(y_true)
+        y_score_binary = lb.transform(dataset.y_pred(model))
         roc_auc = roc_auc_score(
-            y_true=y_true_binarized,
-            y_score=y_score_binarized,
+            y_true=y_true_binary,
+            y_score=y_score_binary,
             average="macro",
         )
@@ -75,10 +75,21 @@ def MinimumROCAUCScore(dataset: VMDataset, model: VMModel, min_threshold: float
         y_score_prob = dataset.y_prob(model)
         roc_auc = roc_auc_score(y_true=y_true, y_score=y_score_prob)
-    return [
+    results = [
         {
             "Score": roc_auc,
             "Threshold": min_threshold,
             "Pass/Fail": "Pass" if roc_auc > min_threshold else "Fail",
         }
-    ], roc_auc > min_threshold
+    ]
+    return (
+        results,
+        roc_auc > min_threshold,
+        RawData(
+            y_true=y_true,
+            roc_auc=roc_auc,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
+    )

validmind 2.7.12__py3-none-any.whl → 2.8.12__py3-none-any.whl

validmind 2.7.12__py3-none-any.whl → 2.8.12__py3-none-any.whl