PyPI - validmind - Versions diffs - 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl - Mend

validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233) hide show

validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py ADDED Viewed

@@ -0,0 +1,172 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+import json
+import sys
+import matplotlib.pyplot as plt
+import pandas as pd
+from fairlearn.metrics import (
+    MetricFrame,
+    count,
+    demographic_parity_ratio,
+    equalized_odds_ratio,
+    false_negative_rate,
+    false_positive_rate,
+    true_positive_rate,
+)
+from fairlearn.postprocessing import ThresholdOptimizer, plot_threshold_optimizer
+from validmind import tags, tasks
+from validmind.logging import get_logger
+logger = get_logger(__name__)
+@tags("bias_and_fairness")
+@tasks("classification", "regression")
+def ProtectedClassesThresholdOptimizer(
+    dataset, pipeline=None, protected_classes=None, X_train=None, y_train=None
+):
+    """
+    Obtains a classifier by applying group-specific thresholds to the provided estimator.
+    ### Purpose
+    This test aims to optimize the fairness of a machine learning model by applying different
+    classification thresholds for different protected groups. It helps in mitigating bias and
+    achieving more equitable outcomes across different demographic groups.
+    ### Test Mechanism
+    The test uses Fairlearn's ThresholdOptimizer to:
+    1. Fit an optimizer on the training data, considering protected classes.
+    2. Apply optimized thresholds to make predictions on the test data.
+    3. Calculate and report various fairness metrics.
+    4. Visualize the optimized thresholds.
+    ### Signs of High Risk
+    - Large disparities in fairness metrics (e.g., Demographic Parity Ratio, Equalized Odds Ratio)
+      across different protected groups.
+    - Significant differences in False Positive Rates (FPR) or True Positive Rates (TPR) between groups.
+    - Thresholds that vary widely across different protected groups.
+    ### Strengths
+    - Provides a post-processing method to improve model fairness without modifying the original model.
+    - Allows for balancing multiple fairness criteria simultaneously.
+    - Offers visual insights into the threshold optimization process.
+    ### Limitations
+    - May lead to a decrease in overall model performance while improving fairness.
+    - Requires access to protected attribute information at prediction time.
+    - The effectiveness can vary depending on the chosen fairness constraint and objective.
+    """
+    # Refuse to run on interpreters older than 3.9.
+    if sys.version_info < (3, 9):
+        raise RuntimeError("This test requires Python 3.9 or higher.")
+    # All four optional inputs are needed to fit the optimizer; when any is
+    # absent the test logs a warning and returns an empty frame instead of raising.
+    required_inputs = (pipeline, protected_classes, X_train, y_train)
+    if any(value is None for value in required_inputs):
+        logger.warning(
+            "Missing required parameters. Please provide pipeline, protected_classes, X_train, and y_train."
+        )
+        return pd.DataFrame()
+    evaluation_df = dataset.df
+    # Fit on the training split, using its protected columns as sensitive features.
+    fitted_optimizer = initialize_and_fit_optimizer(
+        pipeline, X_train, y_train, X_train[protected_classes]
+    )
+    thresholds_figure = plot_thresholds(fitted_optimizer)
+    target_name = dataset.target_column
+    optimized_predictions = make_predictions(
+        fitted_optimizer, evaluation_df, protected_classes
+    )
+    metrics_table = calculate_fairness_metrics(
+        evaluation_df, target_name, optimized_predictions, protected_classes
+    )
+    # Result is a (tables, figure) pair: one named table plus the threshold plot.
+    summary_tables = {"DPR and EOR Table": metrics_table.reset_index()}
+    return summary_tables, thresholds_figure
+def initialize_and_fit_optimizer(pipeline, X_train, y_train, protected_classes_df):
+    """
+    Build a Fairlearn ThresholdOptimizer around `pipeline` and fit it.
+    The optimizer uses the "balanced_accuracy_score" objective under a
+    "demographic_parity" constraint, scores with `predict_proba`, and is created
+    with `prefit=False`. `protected_classes_df` is passed to `fit` as the
+    sensitive features. The optimizer object is returned after `fit` is called.
+    """
+    optimizer_settings = {
+        "estimator": pipeline,
+        "objective": "balanced_accuracy_score",
+        "constraints": "demographic_parity",
+        "predict_method": "predict_proba",
+        "prefit": False,
+    }
+    fitted_optimizer = ThresholdOptimizer(**optimizer_settings)
+    fitted_optimizer.fit(X_train, y_train, sensitive_features=protected_classes_df)
+    return fitted_optimizer
+def plot_thresholds(threshold_optimizer):
+    """
+    Draw the threshold plot for `threshold_optimizer` and return its figure.
+    A new pyplot figure is created, Fairlearn's `plot_threshold_optimizer` is
+    called with `show_plot=False`, and the figure is then closed in pyplot so
+    repeated runs do not accumulate open figures. The returned Figure object
+    itself is still available to the caller.
+    """
+    fig = plt.figure()
+    plot_threshold_optimizer(threshold_optimizer, show_plot=False)
+    # Drop pyplot's reference to the figure; without this every call leaves one
+    # more figure open in pyplot's global registry.
+    plt.close(fig)
+    return fig
+def make_predictions(threshold_optimizer, test_df, protected_classes):
+    """
+    Predict on `test_df` with the given optimizer, passing the frame's
+    `protected_classes` columns as the sensitive features, and return the
+    predictions.
+    """
+    # NOTE(review): the whole frame is passed as X — confirm it carries only the
+    # columns the underlying estimator was fitted on.
+    sensitive_features = test_df[protected_classes]
+    return threshold_optimizer.predict(test_df, sensitive_features=sensitive_features)
+def calculate_fairness_metrics(test_df, target, y_pred_opt, protected_classes):
+    """
+    Tabulate the demographic parity ratio and equalized odds ratio per protected class.
+    Returns a DataFrame with one column per entry of `protected_classes` and two
+    rows ("demographic parity ratio", "equal odds ratio"); each value is rounded
+    to two decimals.
+    """
+    row_labels = ["demographic parity ratio", "equal odds ratio"]
+    ratios_table = pd.DataFrame(columns=protected_classes, index=row_labels)
+    for protected_column in protected_classes:
+        # Both ratios are computed from the same truth, predictions and
+        # single-column sensitive-feature frame.
+        shared_arguments = {
+            "y_true": test_df[target],
+            "y_pred": y_pred_opt,
+            "sensitive_features": test_df[[protected_column]],
+        }
+        parity_ratio = demographic_parity_ratio(**shared_arguments)
+        odds_ratio = equalized_odds_ratio(**shared_arguments)
+        ratios_table[protected_column] = [
+            round(parity_ratio, 2),
+            round(odds_ratio, 2),
+        ]
+    return ratios_table
+def calculate_group_metrics(test_df, target, y_pred_opt, protected_classes):
+    """
+    Compute false positive rate, true positive rate, false negative rate and
+    count through a Fairlearn MetricFrame, using the `protected_classes` columns
+    of `test_df` as sensitive features, and return the frame's `by_group` result.
+    """
+    per_group_frame = MetricFrame(
+        metrics={
+            "fpr": false_positive_rate,
+            "tpr": true_positive_rate,
+            "fnr": false_negative_rate,
+            "count": count,
+        },
+        y_true=test_df[target],
+        y_pred=y_pred_opt,
+        sensitive_features=test_df[protected_classes],
+    )
+    return per_group_frame.by_group
+def get_thresholds_by_group(threshold_optimizer):
+    """
+    Return the optimizer's interpolation rules as a DataFrame.
+    The rules are read from `interpolated_thresholder_.interpolation_dict` and
+    round-tripped through JSON before being loaded with `DataFrame.from_records`.
+    """
+    interpolation_rules = (
+        threshold_optimizer.interpolated_thresholder_.interpolation_dict
+    )
+    # default=str: any value json cannot encode natively is stored as its str().
+    plain_rules = json.loads(json.dumps(interpolation_rules, default=str, indent=4))
+    return pd.DataFrame.from_records(plain_rules)

validmind/tests/data_validation/RollingStatsPlot.py CHANGED Viewed

@@ -10,41 +10,49 @@ from validmind.vm_models import Figure, Metric
 class RollingStatsPlot(Metric):
     """
-    This test evaluates the stationarity of time series data by plotting its rolling mean and standard deviation.
-    **Purpose**: The `RollingStatsPlot` metric is employed to gauge the stationarity of time series data in a given
-    dataset. This metric specifically evaluates the rolling mean and rolling standard deviation of the dataset over a
-    pre-specified window size. The rolling mean provides an understanding of the average trend in the data, while the
-    rolling standard deviation gauges the volatility of the data within the window. It is critical in preparing time
-    series data for modeling as it reveals key insights into data behavior across time.
-    **Test Mechanism**: This mechanism is comprised of two steps. Initially, the rolling mean and standard deviation
-    for each of the dataset's columns are calculated over a window size, which can be user-specified or by default set
-    to 12 data points. Then, the calculated rolling mean and standard deviation are visualized via separate plots,
-    illustrating the trends and volatility in the dataset. A straightforward check is conducted to ensure the existence
-    of columns in the dataset, and to verify that the given dataset has been indexed by its date and time—a necessary
-    prerequisites for time series analysis.
-    **Signs of High Risk**:
+    Evaluates the stationarity of time series data by plotting its rolling mean and standard deviation over a specified
+    window.
+    ### Purpose
+    The `RollingStatsPlot` metric is employed to gauge the stationarity of time series data in a given dataset. This
+    metric specifically evaluates the rolling mean and rolling standard deviation of the dataset over a pre-specified
+    window size. The rolling mean provides an understanding of the average trend in the data, while the rolling
+    standard deviation gauges the volatility of the data within the window. It is critical in preparing time series
+    data for modeling as it reveals key insights into data behavior across time.
+    ### Test Mechanism
+    This mechanism is comprised of two steps. Initially, the rolling mean and standard deviation for each of the
+    dataset's columns are calculated over a window size, which can be user-specified or by default set to 12 data
+    points. Then, the calculated rolling mean and standard deviation are visualized via separate plots, illustrating
+    the trends and volatility in the dataset. A straightforward check is conducted to ensure the existence of columns
+    in the dataset, and to verify that the given dataset has been indexed by its date and time—a necessary prerequisite
+    for time series analysis.
+    ### Signs of High Risk
     - The presence of non-stationary patterns in either the rolling mean or the rolling standard deviation plots, which
     could indicate trends or seasonality in the data that may affect the performance of time series models.
     - Missing columns in the dataset, which would prevent the execution of this metric correctly.
     - The detection of NaN values in the dataset, which may need to be addressed before the metric can proceed
     successfully.
-    **Strengths**:
-    - Offers visualizations of trending behaviour and volatility within the data, facilitating a broader understanding
+    ### Strengths
+    - Offers visualizations of trending behavior and volatility within the data, facilitating a broader understanding
     of the dataset's inherent characteristics.
-    - Checks of the dataset's integrity, such as existence of all required columns and the availability of a datetime
-    index.
+    - Checks of the dataset's integrity, such as the existence of all required columns and the availability of a
+    datetime index.
     - Adjusts to accommodate various window sizes, thus allowing accurate analysis of data with differing temporal
     granularities.
     - Considers each column of the data individually, thereby accommodating multi-feature datasets.
-    **Limitations**:
-    - For all columns, a fixed-size window is utilised. This may not accurately capture patterns in datasets where
+    ### Limitations
+    - For all columns, a fixed-size window is utilized. This may not accurately capture patterns in datasets where
     different features may require different optimal window sizes.
-    - Requires the dataset to be indexed by date and time, hence it may not be useable for datasets without a timestamp
+    - Requires the dataset to be indexed by date and time, hence it may not be usable for datasets without a timestamp
     index.
     - Primarily serves for data visualization as it does not facilitate any quantitative measures for stationarity,
     such as through statistical tests. Therefore, the interpretation is subjective and depends heavily on modeler

validmind/tests/data_validation/RunsTest.py ADDED Viewed

@@ -0,0 +1,72 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+import pandas as pd
+from statsmodels.sandbox.stats.runs import runstest_1samp
+from validmind import tags, tasks
+@tasks("classification", "regression")
+@tags("tabular_data", "statistical_test", "statsmodels")
+def RunsTest(dataset):
+    """
+    Executes Runs Test on ML model to detect non-random patterns in output data sequence.
+    ### Purpose
+    The Runs Test is a statistical procedure used to determine whether the sequence of data extracted from the ML model
+    behaves randomly or not. Specifically, it analyzes runs, sequences of consecutive positives or negatives, in the
+    data to check if there are more or fewer runs than expected under the assumption of randomness. This can be an
+    indication of some pattern, trend, or cycle in the model's output which may need attention.
+    ### Test Mechanism
+    The testing mechanism applies the Runs Test from the statsmodels module on each column of the training dataset. For
+    every feature in the dataset, a Runs Test is executed, whose output includes a Runs Statistic and P-value. A low
+    P-value suggests that data arrangement in the feature is not likely to be random. The results are stored in a
+    dictionary where the keys are the feature names, and the values are another dictionary storing the test statistic
+    and the P-value for each feature.
+    ### Signs of High Risk
+    - High risk is indicated when the P-value is close to zero.
+    - If the P-value is less than a predefined significance level (like 0.05), it suggests that the runs (series of
+    positive or negative values) in the model's output are not random and are longer or shorter than what is expected
+    under a random scenario.
+    - This would mean there's a high risk of non-random distribution of errors or model outcomes, suggesting potential
+    issues with the model.
+    ### Strengths
+    - Straightforward and fast for detecting non-random patterns in data sequence.
+    - Validates assumptions of randomness, which is valuable for checking error distributions in regression models,
+    trendless time series data, and ensuring a classifier doesn't favor one class over another.
+    - Can be applied to both classification and regression tasks, making it versatile.
+    ### Limitations
+    - Assumes that the data is independently and identically distributed (i.i.d.), which might not be the case for many
+    real-world datasets.
+    - The conclusion drawn from the low P-value indicating non-randomness does not provide information about the type
+    or the source of the detected pattern.
+    - Sensitive to extreme values (outliers), and overly large or small run sequences can influence the results.
+    - Does not provide model performance evaluation; it is used to detect patterns in the sequence of outputs only.
+    """
+    # NOTE(review): the description above speaks of model output, while the code
+    # below tests the dataset's numeric feature columns — confirm the wording.
+    numeric_features = dataset.df[dataset.feature_columns_numeric]
+    result_columns = ["feature", "stat", "pvalue"]
+    # One record per feature. The result is built with explicit column names so
+    # a dataset with no numeric features yields an empty table with the expected
+    # header instead of failing when the columns are renamed.
+    records = []
+    for col in numeric_features.columns:
+        runs_stat, runs_p_value = runstest_1samp(numeric_features[col].values)
+        records.append({"feature": col, "stat": runs_stat, "pvalue": runs_p_value})
+    return pd.DataFrame(records, columns=result_columns)

validmind/tests/data_validation/ScatterPlot.py CHANGED Viewed

@@ -5,86 +5,71 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
-class ScatterPlot(Metric):
+@tags("tabular_data", "visualization")
+@tasks("classification", "regression")
+def ScatterPlot(dataset):
     """
-    Creates a scatter plot matrix to visually analyze feature relationships, patterns, and outliers in a dataset.
-    **Purpose**: The ScatterPlot metric is designed to offer a visual analysis of a given dataset by constructing a
-    scatter plot matrix encapsulating all the dataset's features (or columns). Its primary function lies in unearthing
-    relationships, patterns, or outliers across different features, thus providing both quantitative and qualitative
-    insights into the multidimensional relationships within the dataset. This visual assessment aids in understanding
-    the efficacy of the chosen features for model training and their overall suitability.
-    **Test Mechanism**: Using the seaborn library, the ScatterPlot class creates the scatter plot matrix. The process
-    includes retrieving all columns from the dataset, verifying their existence, and subsequently generating a pairplot
-    for these columns. A kernel density estimate (kde) is utilized to present a smoother, univariate distribution along
-    the grid's diagonal. The final plot is housed in an array of Figure objects, each wrapping a matplotlib figure
-    instance for storage and future usage.
-    **Signs of High Risk**:
-    - The emergence of non-linear or random patterns across different feature pairs. This may suggest intricate
-    relationships unfit for linear presumptions.
-    - A lack of clear patterns or clusters which might point to weak or non-existent correlations among features, thus
-    creating a problem for certain model types.
-    - The occurrence of outliers as visual outliers in your data can adversely influence the model's performance.
-    **Strengths**:
-    - It offers insight into the multidimensional relationships among multiple features.
-    - It assists in identifying trends, correlations, and outliers which could potentially affect the model's
-    performance.
-    - As a diagnostic tool, it can validate whether certain assumptions made during the model-creation process, such as
-    linearity, hold true.
-    - The tool's versatility extends to its application for both regression and classification tasks.
-    **Limitations**:
-    - Scatter plot matrices may become cluttered and hard to decipher as the number of features escalates, resulting in
-    complexity and confusion.
-    - While extremely proficient in revealing pairwise relationships, these matrices may fail to illuminate complex
-    interactions that involve three or more features.
-    - These matrices are primarily visual tools, so the precision of quantitative analysis may be compromised.
-    - If not clearly visible, outliers can be missed, which could negatively affect model performance.
-    - It assumes that the dataset can fit into the computer's memory, which might not always be valid particularly for
-    extremely large datasets.
+    Assesses visual relationships, patterns, and outliers among features in a dataset through scatter plot matrices.
+    ### Purpose
+    The ScatterPlot test aims to visually analyze a given dataset by constructing a scatter plot matrix of its
+    numerical features. The primary goal is to uncover relationships, patterns, and outliers across different features
+    to provide both quantitative and qualitative insights into multidimensional relationships within the dataset. This
+    visual assessment aids in understanding the efficacy of the chosen features for model training and their
+    suitability.
+    ### Test Mechanism
+    Using the Seaborn library, the ScatterPlot function creates the scatter plot matrix. The process involves
+    retrieving all numerical columns from the dataset and generating a scatter matrix for these columns. The resulting
+    scatter plot provides visual representations of feature relationships. The function also adjusts axis labels for
+    readability and returns the final plot as a Matplotlib Figure object for further analysis and visualization.
+    ### Signs of High Risk
+    - The emergence of non-linear or random patterns across different feature pairs, suggesting complex relationships
+    unsuitable for linear assumptions.
+    - Lack of clear patterns or clusters, indicating weak or non-existent correlations among features, which could
+    challenge certain model types.
+    - Presence of outliers, as visual outliers can adversely influence the model's performance.
+    ### Strengths
+    - Provides insight into the multidimensional relationships among multiple features.
+    - Assists in identifying trends, correlations, and outliers that could affect model performance.
+    - Validates assumptions made during model creation, such as linearity.
+    - Versatile for application in both regression and classification tasks.
+    - Using Seaborn facilitates an intuitive and detailed visual exploration of data.
+    ### Limitations
+    - Scatter plot matrices may become cluttered and hard to decipher as the number of features increases.
+    - Primarily reveals pairwise relationships and may fail to illuminate complex interactions involving three or more
+    features.
+    - Being a visual tool, precision in quantitative analysis might be compromised.
+    - Outliers not clearly visible in plots can be missed, affecting model performance.
+    - Assumes that the dataset can fit into the computer's memory, which might not be valid for extremely large
+    datasets.
     """
-    name = "scatter_plot"
-    required_inputs = ["dataset"]
-    tasks = ["classification", "regression"]
-    tags = ["tabular_data", "visualization"]
-    def run(self):
-        columns = list(self.inputs.dataset.df.columns)
-        df = self.inputs.dataset.df[columns]
-        if not set(columns).issubset(set(df.columns)):
-            raise ValueError("Provided 'columns' must exist in the dataset")
-        g = sns.pairplot(data=df, diag_kind="kde")
-        for ax in g.axes.flatten():
-            # rotate x axis labels
-            ax.set_xlabel(ax.get_xlabel(), rotation=45)
-            # rotate y axis labels
-            ax.set_ylabel(ax.get_ylabel(), rotation=45)
-            # set y labels alignment
-            ax.yaxis.get_label().set_horizontalalignment("right")
-        # Get the current figure
-        fig = plt.gcf()
-        figures = []
-        figures.append(
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
-            )
-        )
-        plt.close("all")
-        return self.cache_results(
-            figures=figures,
-        )
+    pair_grid = sns.pairplot(data=dataset.df, diag_kind="kde")
+    for axis in pair_grid.axes.flatten():
+        # Tilt both axis labels by 45 degrees.
+        axis.set_xlabel(axis.get_xlabel(), rotation=45)
+        axis.set_ylabel(axis.get_ylabel(), rotation=45)
+        # Right-align the y label.
+        axis.yaxis.get_label().set_horizontalalignment("right")
+    # Grab the current pyplot figure before every open figure is closed.
+    scatter_matrix_figure = plt.gcf()
+    plt.close("all")
+    # The result is a one-element tuple holding the figure.
+    return (scatter_matrix_figure,)

validmind/tests/data_validation/SeasonalDecompose.py CHANGED Viewed

@@ -19,41 +19,45 @@ logger = get_logger(__name__)
 class SeasonalDecompose(Metric):
     """
-    Decomposes dataset features into observed, trend, seasonal, and residual components to identify patterns and
-    validate dataset.
-    **Purpose**: This test utilizes the Seasonal Decomposition of Time Series by Loess (STL) method to decompose a
-    dataset into its fundamental components: observed, trend, seasonal, and residuals. The purpose is to identify
-    implicit patterns, majorly any seasonality, in the dataset's features which aid in developing a more comprehensive
-    understanding and effectively validating the dataset.
-    **Test Mechanism**: The testing process exploits the `seasonal_decompose` function from the
-    `statsmodels.tsa.seasonal` library to evaluate each feature in the dataset. It isolates each feature into four
-    components: observed, trend, seasonal, and residuals, and generates essentially six subplot graphs per feature for
-    visual interpretation of the results. Prior to the seasonal decomposition, non-finite values are scrutinized and
-    removed thus, ensuring reliability in the analysis.
-    **Signs of High Risk**:
-    - **Non-Finiteness**: If a dataset carries too many non-finite values it might flag high risk as these values are
+    Assesses patterns and seasonality in a time series dataset by decomposing its features into foundational components.
+    ### Purpose
+    The Seasonal Decompose test aims to decompose the features of a time series dataset into their fundamental
+    components: observed, trend, seasonal, and residuals. By utilizing the Seasonal Decomposition of Time Series by
+    Loess (STL) method, the test identifies underlying patterns, predominantly seasonality, in the dataset's features.
+    This aids in developing a more comprehensive understanding of the dataset, which in turn facilitates more effective
+    model validation.
+    ### Test Mechanism
+    The testing process leverages the `seasonal_decompose` function from the `statsmodels.tsa.seasonal` library to
+    evaluate each feature in the dataset. It isolates each feature into four components—observed, trend, seasonal, and
+    residuals—and generates six subplot graphs per feature for visual interpretation. Prior to decomposition, the test
+    scrutinizes and removes any non-finite values, ensuring the reliability of the analysis.
+    ### Signs of High Risk
+    - **Non-Finiteness**: Datasets with a high number of non-finite values may flag as high risk since these values are
     omitted before conducting the seasonal decomposition.
-    - **Frequent Warnings**: The test could be at risk if it chronically fails to infer frequency for a scrutinized
-    feature.
-    - **High Seasonality**: A high seasonal component could potentially render forecasts unreliable due to overwhelming
-    seasonal variation.
-    **Strengths**:
-    - **Seasonality Detection**: The code aptly discerns hidden seasonality patterns in the features of datasets.
-    - **Visualization**: The test facilitates interpretation and comprehension via graphical representations.
-    - **Unrestricted Usage**: The code is not confined to any specific regression model, thereby promoting wide-ranging
-    applicability.
-    **Limitations**:
-    - **Dependence on Assumptions**: The test presumes that features in the dataset are periodically distributed. If no
-    frequency could be inferred for a variable, that feature is excluded from the test.
-    - **Handling Non-finite Values**: The test disregards non-finite values during the analysis which could potentially
-    result in incomplete understanding of the dataset.
-    - **Unreliability with Noisy Datasets**: The test tends to produce unreliable results when used with heavy noise
-    present in the dataset.
+    - **Frequent Warnings**: Chronic failure to infer the frequency for a scrutinized feature indicates high risk.
+    - **High Seasonality**: A significant seasonal component could potentially render forecasts unreliable due to
+    overwhelming seasonal variation.
+    ### Strengths
+    - **Seasonality Detection**: Accurately discerns hidden seasonality patterns in dataset features.
+    - **Visualization**: Facilitates interpretation and comprehension through graphical representations.
+    - **Unrestricted Usage**: Not confined to any specific regression model, promoting wide-ranging applicability.
+    ### Limitations
+    - **Dependence on Assumptions**: Assumes that dataset features are periodically distributed. Features with no
+    inferable frequency are excluded from the test.
+    - **Handling Non-Finite Values**: Disregards non-finite values during analysis, potentially resulting in an
+    incomplete understanding of the dataset.
+    - **Unreliability with Noisy Datasets**: Produces unreliable results when used with datasets that contain heavy
+    noise.
     """
     name = "seasonal_decompose"

validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl