validmind 2.8.10__py3-none-any.whl → 2.8.20__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- validmind/__init__.py +6 -5
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +17 -11
- validmind/ai/utils.py +2 -2
- validmind/api_client.py +75 -32
- validmind/client.py +108 -100
- validmind/client_config.py +3 -3
- validmind/datasets/classification/__init__.py +7 -3
- validmind/datasets/credit_risk/lending_club.py +28 -16
- validmind/datasets/nlp/cnn_dailymail.py +10 -4
- validmind/datasets/regression/__init__.py +22 -5
- validmind/errors.py +17 -7
- validmind/input_registry.py +1 -1
- validmind/logging.py +44 -35
- validmind/models/foundation.py +2 -2
- validmind/models/function.py +10 -3
- validmind/template.py +30 -22
- validmind/test_suites/__init__.py +2 -2
- validmind/tests/_store.py +13 -4
- validmind/tests/comparison.py +65 -33
- validmind/tests/data_validation/ACFandPACFPlot.py +4 -1
- validmind/tests/data_validation/AutoMA.py +1 -1
- validmind/tests/data_validation/BivariateScatterPlots.py +5 -1
- validmind/tests/data_validation/BoxPierce.py +3 -1
- validmind/tests/data_validation/ClassImbalance.py +4 -2
- validmind/tests/data_validation/DatasetDescription.py +3 -24
- validmind/tests/data_validation/DescriptiveStatistics.py +1 -1
- validmind/tests/data_validation/DickeyFullerGLS.py +1 -1
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +1 -1
- validmind/tests/data_validation/HighCardinality.py +5 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +1 -1
- validmind/tests/data_validation/IQROutliersBarPlot.py +5 -3
- validmind/tests/data_validation/IQROutliersTable.py +5 -2
- validmind/tests/data_validation/IsolationForestOutliers.py +5 -4
- validmind/tests/data_validation/JarqueBera.py +2 -2
- validmind/tests/data_validation/LJungBox.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/MissingValues.py +14 -10
- validmind/tests/data_validation/MissingValuesBarPlot.py +3 -1
- validmind/tests/data_validation/MutualInformation.py +2 -1
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +1 -1
- validmind/tests/data_validation/ProtectedClassesCombination.py +2 -0
- validmind/tests/data_validation/ProtectedClassesDescription.py +2 -2
- validmind/tests/data_validation/ProtectedClassesDisparity.py +9 -5
- validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +10 -2
- validmind/tests/data_validation/RollingStatsPlot.py +2 -1
- validmind/tests/data_validation/ScoreBandDefaultRates.py +4 -2
- validmind/tests/data_validation/SeasonalDecompose.py +1 -1
- validmind/tests/data_validation/ShapiroWilk.py +2 -2
- validmind/tests/data_validation/Skewness.py +7 -6
- validmind/tests/data_validation/SpreadPlot.py +1 -1
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +1 -1
- validmind/tests/data_validation/TabularDateTimeHistograms.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +4 -1
- validmind/tests/data_validation/TimeSeriesFrequency.py +1 -1
- validmind/tests/data_validation/TimeSeriesOutliers.py +7 -2
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/data_validation/ZivotAndrewsArch.py +5 -2
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +5 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +3 -1
- validmind/tests/data_validation/nlp/TextDescription.py +1 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +14 -11
- validmind/tests/load.py +38 -24
- validmind/tests/model_validation/BertScore.py +7 -1
- validmind/tests/model_validation/BleuScore.py +7 -1
- validmind/tests/model_validation/ClusterSizeDistribution.py +3 -1
- validmind/tests/model_validation/ContextualRecall.py +9 -1
- validmind/tests/model_validation/FeaturesAUC.py +1 -1
- validmind/tests/model_validation/MeteorScore.py +7 -1
- validmind/tests/model_validation/ModelPredictionResiduals.py +5 -1
- validmind/tests/model_validation/RegardScore.py +6 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -1
- validmind/tests/model_validation/RougeScore.py +3 -1
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +2 -0
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +10 -2
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +6 -2
- validmind/tests/model_validation/TokenDisparity.py +5 -1
- validmind/tests/model_validation/ToxicityScore.py +2 -0
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +5 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +5 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +5 -1
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +2 -0
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +5 -1
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +6 -2
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +3 -1
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +5 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +5 -1
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +6 -1
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -3
- validmind/tests/model_validation/ragas/AspectCritic.py +4 -1
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -3
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -3
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -3
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -3
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -3
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +1 -1
- validmind/tests/model_validation/ragas/ResponseRelevancy.py +5 -3
- validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -3
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +9 -9
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +9 -9
- validmind/tests/model_validation/sklearn/CalibrationCurve.py +5 -2
- validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +28 -5
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +5 -1
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +24 -14
- validmind/tests/model_validation/sklearn/CompletenessScore.py +8 -9
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -3
- validmind/tests/model_validation/sklearn/FeatureImportance.py +6 -2
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -9
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +14 -9
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +4 -2
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +6 -1
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +12 -7
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +12 -7
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +21 -6
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -3
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +5 -1
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -1
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +6 -1
- validmind/tests/model_validation/sklearn/ROCCurve.py +3 -1
- validmind/tests/model_validation/sklearn/RegressionErrors.py +6 -2
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +13 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +8 -5
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +5 -1
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +34 -26
- validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +10 -2
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +5 -1
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -9
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +15 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +5 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +6 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +8 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +2 -2
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +6 -2
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +8 -2
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +3 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +7 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -0
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +2 -0
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +4 -2
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +3 -1
- validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +11 -1
- validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +10 -2
- validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +8 -1
- validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +18 -2
- validmind/tests/ongoing_monitoring/FeatureDrift.py +9 -2
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +8 -2
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +13 -2
- validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +13 -2
- validmind/tests/ongoing_monitoring/ROCCurveDrift.py +16 -2
- validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +11 -2
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +13 -2
- validmind/tests/output.py +66 -11
- validmind/tests/prompt_validation/Clarity.py +1 -1
- validmind/tests/prompt_validation/NegativeInstruction.py +1 -1
- validmind/tests/prompt_validation/Robustness.py +6 -1
- validmind/tests/prompt_validation/Specificity.py +1 -1
- validmind/tests/run.py +28 -14
- validmind/tests/test_providers.py +28 -35
- validmind/tests/utils.py +17 -4
- validmind/unit_metrics/__init__.py +1 -1
- validmind/utils.py +295 -31
- validmind/vm_models/dataset/dataset.py +19 -16
- validmind/vm_models/dataset/utils.py +5 -3
- validmind/vm_models/figure.py +6 -6
- validmind/vm_models/input.py +6 -5
- validmind/vm_models/model.py +5 -5
- validmind/vm_models/result/result.py +122 -43
- validmind/vm_models/result/utils.py +9 -28
- validmind/vm_models/test_suite/__init__.py +5 -0
- validmind/vm_models/test_suite/runner.py +5 -5
- validmind/vm_models/test_suite/summary.py +20 -2
- validmind/vm_models/test_suite/test.py +6 -6
- validmind/vm_models/test_suite/test_suite.py +10 -10
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/METADATA +4 -5
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/RECORD +189 -188
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/WHEEL +1 -1
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/LICENSE +0 -0
- {validmind-2.8.10.dist-info → validmind-2.8.20.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ragas/ContextEntityRecall.py

@@ -118,8 +118,10 @@ def ContextEntityRecall(

     score_column = "context_entity_recall"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Context Entity Recall"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Context Entity Recall")

     return (
         {
@@ -143,5 +145,5 @@ def ContextEntityRecall(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/ContextPrecision.py

@@ -114,8 +114,10 @@ def ContextPrecision(

     score_column = "llm_context_precision_with_reference"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Context Precision"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Context Precision")

     return (
         {
@@ -135,5 +137,5 @@ def ContextPrecision(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py

@@ -109,8 +109,10 @@ def ContextPrecisionWithoutReference(

     score_column = "llm_context_precision_without_reference"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Context Precision"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Context Precision")

     return (
         {
@@ -130,5 +132,5 @@ def ContextPrecisionWithoutReference(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/ContextRecall.py

@@ -114,8 +114,10 @@ def ContextRecall(

     score_column = "context_recall"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Context Recall"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Context Recall")

     return (
         {
@@ -135,5 +137,5 @@ def ContextRecall(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/Faithfulness.py

@@ -119,8 +119,10 @@ def Faithfulness(

     score_column = "faithfulness"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Faithfulness"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Faithfulness")

     return (
         {
@@ -140,5 +142,5 @@ def Faithfulness(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/ResponseRelevancy.py

@@ -133,8 +133,10 @@ def ResponseRelevancy(

     score_column = "answer_relevancy"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Response Relevancy"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Response Relevancy")

     return (
         {
@@ -154,5 +156,5 @@ def ResponseRelevancy(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
validmind/tests/model_validation/ragas/SemanticSimilarity.py

@@ -112,8 +112,10 @@ def SemanticSimilarity(

     score_column = "semantic_similarity"

-    fig_histogram = px.histogram(
-
+    fig_histogram = px.histogram(
+        x=result_df[score_column].to_list(), nbins=10, title="Semantic Similarity"
+    )
+    fig_box = px.box(x=result_df[score_column].to_list(), title="Semantic Similarity")

     return (
         {
@@ -133,5 +135,5 @@ def SemanticSimilarity(
         },
         fig_histogram,
         fig_box,
-        RawData(evaluation_results=result_df),
+        RawData(evaluation_results=result_df, dataset=dataset.input_id),
     )
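The seven ragas test hunks above share one change: each test now builds a histogram and a box plot from its score column and tags the RawData with the originating dataset's input_id. A minimal sketch of that plotting pattern, using a made-up result_df in place of the ragas evaluation output:

import pandas as pd
import plotly.express as px

# Stand-in for the ragas evaluation output; in the library, result_df comes
# from the ragas evaluation performed inside each test.
result_df = pd.DataFrame({"context_entity_recall": [0.2, 0.55, 0.7, 0.9, 1.0]})
score_column = "context_entity_recall"

# Same figure-building pattern the 2.8.20 tests use: histogram plus box plot.
fig_histogram = px.histogram(
    x=result_df[score_column].to_list(), nbins=10, title="Context Entity Recall"
)
fig_box = px.box(x=result_df[score_column].to_list(), title="Context Entity Recall")

fig_histogram.show()
fig_box.show()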
validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py

@@ -4,7 +4,7 @@

 from sklearn.metrics import adjusted_mutual_info_score

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -52,11 +52,11 @@ def AdjustedMutualInformation(model: VMModel, dataset: VMDataset):
     - The interpretability of the score can be complex as it depends on the understanding of information theory
     concepts.
     """
-
-
-
-
-
-
-
-
+    ami_score = adjusted_mutual_info_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    return [{"Adjusted Mutual Information": ami_score}], RawData(
+        ami_score=ami_score, model=model.input_id, dataset=dataset.input_id
+    )
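AdjustedMutualInformation now computes the score inline and returns a one-row table together with a RawData record carrying the score and the model/dataset input_ids; AdjustedRandIndex, CompletenessScore, FowlkesMallowsScore, HomogeneityScore, and ClusterPerformanceMetrics below receive the same treatment. A standalone sketch of the underlying computation, with hypothetical labels standing in for dataset.y and dataset.y_pred(model):

from sklearn.metrics import adjusted_mutual_info_score

# Hypothetical cluster labels standing in for dataset.y and dataset.y_pred(model).
labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [1, 1, 0, 0, 2, 2]

ami_score = adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred)

# The test returns this one-row table plus a RawData record of the score and
# the originating model/dataset input_ids.
print([{"Adjusted Mutual Information": ami_score}])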
validmind/tests/model_validation/sklearn/AdjustedRandIndex.py

@@ -4,7 +4,7 @@

 from sklearn.metrics import adjusted_rand_score

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -49,11 +49,11 @@ def AdjustedRandIndex(model: VMModel, dataset: VMDataset):
     - It may be difficult to interpret the implications of an ARI score without context or a benchmark, as it is
     heavily dependent on the characteristics of the dataset used.
     """
-
-
-
-
-
-
-
-
+    ari = adjusted_rand_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    return [{"Adjusted Rand Index": ari}], RawData(
+        ari_score=ari, model=model.input_id, dataset=dataset.input_id
+    )
validmind/tests/model_validation/sklearn/CalibrationCurve.py

@@ -72,7 +72,10 @@ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):

     # Create DataFrame for raw data
     raw_data = RawData(
-        mean_predicted_probability=prob_pred,
+        mean_predicted_probability=prob_pred,
+        observed_frequency=prob_true,
+        model=model.input_id,
+        dataset=dataset.input_id,
     )

     # Create Plotly figure
@@ -114,4 +117,4 @@ def CalibrationCurve(model: VMModel, dataset: VMDataset, n_bins: int = 10):
         template="plotly_white",
     )

-    return
+    return fig, raw_data
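The 2.8.10 version of CalibrationCurve ended in a bare return, so neither the figure nor the raw data reached the caller; 2.8.20 returns both and records the observed frequencies alongside the mean predicted probabilities. A rough sketch of where those two arrays come from, assuming they are produced by scikit-learn's calibration_curve (the prob_true/prob_pred names in the diff suggest this, but the call itself is outside the hunk):

import numpy as np
from sklearn.calibration import calibration_curve

# Hypothetical labels and scores standing in for dataset.y and dataset.y_prob(model).
y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1])
y_prob = np.array([0.1, 0.3, 0.7, 0.8, 0.65, 0.2, 0.9, 0.4, 0.55, 0.95])

# prob_true maps to observed_frequency and prob_pred to mean_predicted_probability
# in the RawData record built by the test.
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=5)
print(prob_pred, prob_true)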
validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py

@@ -2,17 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

+from typing import Dict, List, Optional, Union
+
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


-def find_optimal_threshold(
+def find_optimal_threshold(
+    y_true: np.ndarray,
+    y_prob: np.ndarray,
+    method: str = "youden",
+    target_recall: Optional[float] = None,
+) -> Dict[str, Union[str, float]]:
     """
     Find the optimal classification threshold using various methods.

@@ -80,8 +87,11 @@ def find_optimal_threshold(y_true, y_prob, method="youden", target_recall=None):
 @tags("model_validation", "threshold_optimization", "classification_metrics")
 @tasks("classification")
 def ClassifierThresholdOptimization(
-    dataset: VMDataset,
-
+    dataset: VMDataset,
+    model: VMModel,
+    methods: Optional[List[str]] = None,
+    target_recall: Optional[float] = None,
+) -> Dict[str, Union[pd.DataFrame, go.Figure]]:
     """
     Analyzes and visualizes different threshold optimization methods for binary classification models.

@@ -255,4 +265,17 @@ def ClassifierThresholdOptimization(
     # Create results table and sort by threshold descending
     table = pd.DataFrame(results).sort_values("threshold", ascending=False)

-    return
+    return (
+        fig,
+        table,
+        RawData(
+            fpr=fpr,
+            tpr=tpr,
+            precision=precision,
+            recall=recall,
+            thresholds_roc=thresholds_roc,
+            thresholds_pr=thresholds_pr,
+            model=model.input_id,
+            dataset=dataset.input_id,
+        ),
+    )
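ClassifierThresholdOptimization now takes the model explicitly, accepts an optional methods list and target_recall, and returns the figure, the results table, and a RawData record with the ROC and precision-recall arrays (previously it ended in a bare return). The default method named in find_optimal_threshold is "youden"; the sketch below shows what a Youden's J threshold selection looks like on hypothetical data, noting that the library's own implementation may differ in detail:

import numpy as np
from sklearn.metrics import roc_curve

# Hypothetical labels/probabilities standing in for dataset.y and dataset.y_prob(model).
y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0])
y_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7, 0.9, 0.3, 0.6, 0.5])

# Youden's J statistic picks the threshold that maximizes tpr - fpr.
fpr, tpr, thresholds_roc = roc_curve(y_true, y_prob)
best = int(np.argmax(tpr - fpr))
print({"method": "youden", "threshold": float(thresholds_roc[best])})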
validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py

@@ -84,4 +84,8 @@ def ClusterCosineSimilarity(model: VMModel, dataset: VMDataset):
     if not table:
         raise SkipTestError("No clusters found")

-    return table, RawData(
+    return table, RawData(
+        cluster_centroids=cluster_centroids,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py

@@ -11,7 +11,7 @@ from sklearn.metrics import (
     v_measure_score,
 )

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel

 HOMOGENEITY = """
@@ -115,53 +115,63 @@ def ClusterPerformanceMetrics(model: VMModel, dataset: VMDataset):
     - Does not consider aspects like computational efficiency of the model or its capability to handle high dimensional
     data.
     """
-
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    metrics = [
         {
             "Metric": "Homogeneity Score",
             "Description": HOMOGENEITY,
             "Value": homogeneity_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Completeness Score",
             "Description": COMPLETENESS,
             "Value": completeness_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "V Measure",
             "Description": V_MEASURE,
             "Value": v_measure_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Adjusted Rand Index",
             "Description": ADJUSTED_RAND_INDEX,
             "Value": adjusted_rand_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
            ),
         },
         {
             "Metric": "Adjusted Mutual Information",
             "Description": ADJUSTED_MUTUAL_INFORMATION,
             "Value": adjusted_mutual_info_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
         {
             "Metric": "Fowlkes-Mallows score",
             "Description": FOULKES_MALLOWS_SCORE,
             "Value": fowlkes_mallows_score(
-                labels_true=
-                labels_pred=
+                labels_true=y_true,
+                labels_pred=y_pred,
             ),
         },
     ]
+
+    return metrics, RawData(
+        true_labels=y_true,
+        predicted_labels=y_pred,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/sklearn/CompletenessScore.py

@@ -4,7 +4,7 @@

 from sklearn.metrics import completeness_score

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -47,11 +47,10 @@ def CompletenessScore(model: VMModel, dataset: VMDataset):
     - The Completeness Score only applies to clustering models; it cannot be used for other types of machine learning
     models.
     """
-
-
-
-
-
-
-
-    ]
+    score = completeness_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+    return [{"Completeness Score": score}], RawData(
+        score=score, model=model.input_id, dataset=dataset.input_id
+    )
validmind/tests/model_validation/sklearn/ConfusionMatrix.py

@@ -19,7 +19,11 @@ from validmind.vm_models import VMDataset, VMModel
     "visualization",
 )
 @tasks("classification", "text_classification")
-def ConfusionMatrix(
+def ConfusionMatrix(
+    dataset: VMDataset,
+    model: VMModel,
+    threshold: float = 0.5,
+):
     """
     Evaluates and visually represents the classification ML model's predictive performance using a Confusion Matrix
     heatmap.
@@ -66,7 +70,17 @@ def ConfusionMatrix(dataset: VMDataset, model: VMModel):
     - Risks of misinterpretation exist because the matrix doesn't directly provide precision, recall, or F1-score data.
     These metrics have to be computed separately.
     """
-
+    # Get predictions using threshold for binary classification if possible
+    if hasattr(model.model, "predict_proba"):
+        y_prob = dataset.y_prob(model)
+        # Handle both 1D and 2D probability arrays
+        if y_prob.ndim == 2:
+            y_pred = (y_prob[:, 1] > threshold).astype(int)
+        else:
+            y_pred = (y_prob > threshold).astype(int)
+    else:
+        y_pred = dataset.y_pred(model)
+
     y_true = dataset.y.astype(y_pred.dtype)

     labels = np.unique(y_true)
@@ -119,4 +133,9 @@ def ConfusionMatrix(dataset: VMDataset, model: VMModel):
         font=dict(size=14),
     )

-    return fig, RawData(
+    return fig, RawData(
+        confusion_matrix=cm,
+        threshold=threshold,
+        dataset=dataset.input_id,
+        model=model.input_id,
+    )
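ConfusionMatrix gains a threshold parameter (default 0.5). When the wrapped model exposes predict_proba, predictions are derived from the probabilities rather than dataset.y_pred(model), so the matrix can be recomputed at any operating point. The thresholding branch from the diff, applied to a hypothetical probability array:

import numpy as np

# Hypothetical output of dataset.y_prob(model); column 1 is the positive class.
y_prob = np.array([[0.8, 0.2], [0.4, 0.6], [0.3, 0.7], [0.9, 0.1]])
threshold = 0.5

# Same branch the updated test uses for 2D vs 1D probability arrays.
if y_prob.ndim == 2:
    y_pred = (y_prob[:, 1] > threshold).astype(int)
else:
    y_pred = (y_prob > threshold).astype(int)

print(y_pred)  # -> [0 1 1 0]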
validmind/tests/model_validation/sklearn/FeatureImportance.py

@@ -5,7 +5,7 @@
 import pandas as pd
 from sklearn.inspection import permutation_importance

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -78,4 +78,8 @@ def FeatureImportance(dataset: VMDataset, model: VMModel, num_features: int = 3)
         else:
             result[f"Feature {i + 1}"] = None

-    return pd.DataFrame([result])
+    return pd.DataFrame([result]), RawData(
+        permutation_importance=pfi_values,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
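FeatureImportance keeps its one-row table but now also returns RawData carrying the underlying pfi_values, which, given the sklearn.inspection.permutation_importance import in this file, presumably come from a call along these lines (the model, data, and parameters below are hypothetical):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Hypothetical model and data standing in for the VMModel/VMDataset inputs.
X, y = make_classification(n_samples=200, n_features=5, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X, y)

# The importances object is what the updated test exposes as RawData.permutation_importance.
pfi_values = permutation_importance(clf, X, y, n_repeats=5, random_state=0)
print(pfi_values.importances_mean)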
validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py

@@ -4,7 +4,7 @@

 from sklearn import metrics

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -52,11 +52,14 @@ def FowlkesMallowsScore(dataset: VMDataset, model: VMModel):
     - It does not handle mismatching numbers of clusters between the true and predicted labels. As such, it may return
     misleading results if the predicted labels suggest a different number of clusters than what is in the true labels.
     """
-
-
-
-
-
-
-
-
+    fowlkes_mallows_score = metrics.fowlkes_mallows_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    return [{"Fowlkes-Mallows score": fowlkes_mallows_score}], RawData(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/sklearn/HomogeneityScore.py

@@ -4,7 +4,7 @@

 from sklearn import metrics

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -50,11 +50,16 @@ def HomogeneityScore(dataset: VMDataset, model: VMModel):
     - The score does not address the actual number of clusters formed, or the evenness of cluster sizes. It only checks
     the homogeneity within the given clusters created by the model.
     """
-
-
-
-
-
-
-
-
+    homogeneity_score = metrics.homogeneity_score(
+        labels_true=dataset.y,
+        labels_pred=dataset.y_pred(model),
+    )
+
+    raw_data = RawData(
+        y_true=dataset.y,
+        y_pred=dataset.y_pred(model),
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
+
+    return ([{"Homogeneity Score": homogeneity_score}], raw_data)
validmind/tests/model_validation/sklearn/HyperParametersTuning.py

@@ -7,7 +7,7 @@ from typing import Dict, List, Union
 from sklearn.metrics import make_scorer, recall_score
 from sklearn.model_selection import GridSearchCV

-from validmind import tags, tasks
+from validmind import RawData, tags, tasks
 from validmind.vm_models import VMDataset, VMModel


@@ -162,4 +162,6 @@ def HyperParametersTuning(

         results.append(row_result)

-    return results
+    return results, RawData(
+        model=model.input_id, dataset=dataset.input_id, param_grid=param_grid
+    )
validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py

@@ -124,4 +124,9 @@ def KMeansClustersOptimization(

     fig.update_layout(showlegend=False)

-    return fig, RawData(
+    return fig, RawData(
+        distortions=distortions,
+        silhouette_avg=silhouette_avg,
+        model=model.input_id,
+        dataset=dataset.input_id,
+    )
validmind/tests/model_validation/sklearn/MinimumAccuracy.py

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 from sklearn.metrics import accuracy_score

+from validmind import RawData
 from validmind.tests import tags, tasks
 from validmind.vm_models import VMDataset, VMModel

@@ -50,10 +51,14 @@ def MinimumAccuracy(dataset: VMDataset, model: VMModel, min_threshold: float = 0
     """
     accuracy = accuracy_score(dataset.y, dataset.y_pred(model))

-    return
-
-
-
-
-
-
+    return (
+        [
+            {
+                "Score": accuracy,
+                "Threshold": min_threshold,
+                "Pass/Fail": "Pass" if accuracy > min_threshold else "Fail",
+            }
+        ],
+        accuracy > min_threshold,
+        RawData(model=model.input_id, dataset=dataset.input_id),
+    )
validmind/tests/model_validation/sklearn/MinimumF1Score.py

@@ -5,6 +5,7 @@
 import numpy as np
 from sklearn.metrics import f1_score

+from validmind import RawData
 from validmind.tests import tags, tasks
 from validmind.vm_models import VMDataset, VMModel

@@ -58,10 +59,14 @@ def MinimumF1Score(dataset: VMDataset, model: VMModel, min_threshold: float = 0.
     else:
         score = f1_score(dataset.y, dataset.y_pred(model))

-    return
-
-
-
-
-
-
+    return (
+        [
+            {
+                "Score": score,
+                "Threshold": min_threshold,
+                "Pass/Fail": "Pass" if score > min_threshold else "Fail",
+            }
+        ],
+        score > min_threshold,
+        RawData(score=score, model=model.input_id, dataset=dataset.input_id),
+    )
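MinimumAccuracy and MinimumF1Score previously ended in a bare return; both now return the score table, a pass/fail boolean, and a RawData record. A plain-Python sketch of the new contract for MinimumAccuracy, using hypothetical labels and an illustrative threshold:

from sklearn.metrics import accuracy_score

# Hypothetical labels standing in for dataset.y and dataset.y_pred(model).
y_true = [0, 1, 1, 0, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 0, 1, 1]
min_threshold = 0.7  # illustrative value; see the function signature for the real default

accuracy = accuracy_score(y_true, y_pred)
row = {
    "Score": accuracy,
    "Threshold": min_threshold,
    "Pass/Fail": "Pass" if accuracy > min_threshold else "Fail",
}
passed = accuracy > min_threshold  # the boolean the test now returns alongside the table
print(row, passed)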
|