PyPI - validmind - Versions diffs - 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl - Mend

validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (110) hide show

validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py ADDED Viewed

@@ -0,0 +1,96 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+from itertools import combinations
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from sklearn.metrics.pairwise import cosine_similarity
+def CosineSimilarityComparison(dataset, models):
+    """
+    Computes pairwise cosine similarities between model embeddings and visualizes the results through bar charts,
+    alongside compiling a comprehensive table of descriptive statistics for each model pair.
+    **Purpose:**
+    This function is designed to analyze and compare the embeddings produced by different models using Cosine Similarity.
+    Cosine Similarity, a measure calculating the cosine of the angle between two vectors, is widely used to determine
+    the alignment or similarity between vectors in high-dimensional spaces, such as text embeddings. This analysis helps
+    to understand how similar or different the models' predictions are in terms of embedding generation.
+    **Test Mechanism:**
+    The function begins by computing the embeddings for each model using the provided dataset. It then calculates the
+    cosine similarity for every possible pair of models, generating a similarity matrix. Each element of this matrix
+    represents the cosine similarity between two model embeddings. The function flattens this matrix and uses it to
+    create a bar chart for each model pair, visualizing their similarity distribution. Additionally, it compiles a table
+    with descriptive statistics (mean, median, standard deviation, minimum, and maximum) for the similarities of each
+    pair, including a reference to the compared models.
+    **Signs of High Risk:**
+    - A high concentration of cosine similarity values close to 1 could suggest that the models are producing very
+      similar embeddings, which could be a sign of redundancy or lack of diversity in model training or design.
+    - Conversely, very low similarity values near -1 indicate strong dissimilarity, potentially highlighting models
+      that are too divergent, possibly focusing on very different features of the data.
+    **Strengths:**
+    - Enables detailed comparisons between multiple models' embedding strategies through visual and statistical means.
+    - Helps identify which models produce similar or dissimilar embeddings, useful for tasks requiring model diversity.
+    - Provides quantitative and visual feedback on the degree of similarity, enhancing interpretability of model
+      behavior in embedding spaces.
+    **Limitations:**
+    - The analysis is confined to the comparison of embeddings and does not assess the overall performance of the models
+      in terms of their primary tasks (e.g., classification, regression).
+    - Assumes that the models are suitable for generating comparable embeddings, which might not always be the case,
+      especially across different types of models.
+    - Interpretation of results is heavily dependent on the understanding of Cosine Similarity and the nature of high-dimensional
+      embedding spaces.
+    """
+    figures = []
+    # Initialize a list to store data for the DataFrame
+    all_stats = []
+    # Generate all pairs of models for comparison
+    for model_A, model_B in combinations(models, 2):
+        embeddings_A = np.stack(dataset.y_pred(model_A))
+        embeddings_B = np.stack(dataset.y_pred(model_B))
+        # Calculate pairwise cosine similarity
+        similarity_matrix = cosine_similarity(embeddings_A, embeddings_B)
+        similarities = similarity_matrix.flatten()
+        # Generate statistics and add model combination as a column
+        stats_data = {
+            "Combination": f"{model_A.input_id} vs {model_B.input_id}",
+            "Mean": np.mean(similarities),
+            "Median": np.median(similarities),
+            "Standard Deviation": np.std(similarities),
+            "Minimum": np.min(similarities),
+            "Maximum": np.max(similarities),
+        }
+        all_stats.append(stats_data)
+        # Generate an index for each similarity value
+        indices = range(len(similarities))
+        # Create the bar chart using Plotly
+        fig = px.bar(
+            x=indices,
+            y=similarities,
+            labels={"x": "Pair Index", "y": "Cosine Similarity"},
+            title=f"Cosine Similarity - {model_A.input_id} vs {model_B.input_id}",
+        )
+        fig.update_layout(xaxis_title="Pair Index", yaxis_title="Cosine Similarity")
+        figures.append(fig)
+    # Create a DataFrame from all collected statistics
+    stats_df = pd.DataFrame(all_stats)
+    return (stats_df, *tuple(figures))

validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py ADDED Viewed

@@ -0,0 +1,71 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+import numpy as np
+import plotly.express as px
+from sklearn.metrics.pairwise import cosine_similarity
+def CosineSimilarityHeatmap(
+    dataset,
+    model,
+    title="Cosine Similarity Matrix",
+    color="Cosine Similarity",
+    xaxis_title="Index",
+    yaxis_title="Index",
+    color_scale="Blues",
+):
+    """
+    Generates an interactive heatmap to visualize the cosine similarities among embeddings derived from a given model.
+    **Purpose:**
+    This function is designed to visually analyze the cosine similarities of embeddings from a specific model.
+    Cosine similarity, a measure of the cosine of the angle between two vectors, aids in understanding the
+    orientation and similarity of vectors in multi-dimensional space. This is particularly valuable for exploring
+    text embeddings and their relative similarities among documents, words, or phrases.
+    **Test Mechanism:**
+    The function operates through a sequence of steps to visualize cosine similarities. Initially,
+    embeddings are extracted for each dataset entry using the designated model. Following this,
+    the function computes the pairwise cosine similarities among these embeddings. The computed similarities
+    are then displayed in an interactive heatmap.
+    **Signs of High Risk:**
+    - High similarity values (close to 1) across the heatmap might not always be indicative of a risk;
+    however, in contexts where diverse perspectives or features are desired, this could suggest a lack of
+    diversity in the model's learning process or potential redundancy.
+    - Similarly, low similarity values (close to -1) indicate strong dissimilarity, which could be beneficial in
+    scenarios demanding diverse outputs. However, in cases where consistency is needed, these low values might
+    highlight that the model is unable to capture a coherent set of features from the data, potentially leading to poor performance on related tasks.
+    **Strengths:**
+    - Provides an interactive and intuitive visual representation of embedding similarities, facilitating easy exploration and analysis.
+    - Allows customization of visual elements such as title, axis labels, and color scale to suit specific analytical needs and preferences.
+    **Limitations:**
+    - As the number of embeddings increases, the effectiveness of the heatmap might diminish due to overcrowding, making it hard to discern detailed similarities.
+    - The interpretation of the heatmap heavily relies on the appropriate setting of the color scale, as incorrect settings can lead to misleading visual interpretations.
+    """
+    embeddings = np.stack(dataset.y_pred(model))
+    # Calculate pairwise cosine similarity
+    similarity_matrix = cosine_similarity(embeddings)
+    # Create the heatmap using Plotly
+    fig = px.imshow(
+        similarity_matrix,
+        labels=dict(x=xaxis_title, y=yaxis_title, color=color),
+        text_auto=True,
+        aspect="auto",
+        color_continuous_scale=color_scale,
+    )
+    fig.update_layout(
+        title=f"{title} - {model.input_id}",
+        xaxis_title=xaxis_title,
+        yaxis_title=yaxis_title,
+    )
+    return fig

validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py ADDED Viewed

@@ -0,0 +1,92 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+from itertools import combinations
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from sklearn.metrics.pairwise import euclidean_distances
+def EuclideanDistanceComparison(dataset, models):
+    """
+    Computes pairwise Euclidean distances between model embeddings and visualizes the results through bar charts,
+    alongside compiling a comprehensive table of descriptive statistics for each model pair.
+    **Purpose:**
+    This function is designed to analyze and compare the embeddings produced by different models using Euclidean Distance.
+    Euclidean Distance measures the "ordinary" straight-line distance between two points in Euclidean space, providing a
+    straightforward metric to assess the absolute differences between vectors. This analysis helps in understanding the
+    magnitude of dissimilarity between the embeddings generated by different models, which is crucial for tasks that require
+    distinctive model responses or feature separations.
+    **Test Mechanism:**
+    The function begins by computing the embeddings for each model using the provided dataset. It then calculates the
+    Euclidean distance for every possible pair of models, generating a distance matrix. Each element of this matrix
+    represents the Euclidean distance between two model embeddings. The function flattens this matrix and uses it to
+    create a bar chart for each model pair, visualizing their distance distribution. Additionally, it compiles a table
+    with descriptive statistics (mean, median, standard deviation, minimum, and maximum) for the distances of each
+    pair, including a reference to the compared models.
+    **Signs of High Risk:**
+    - Very high distance values could suggest that the models are focusing on completely different features or aspects
+      of the data, which might be undesirable for ensemble methods or similar applications where some degree of
+      consensus is expected.
+    - Extremely low distances across different models might indicate redundancy, suggesting that the models are not
+      providing diverse enough perspectives on the data.
+    **Strengths:**
+    - Provides a clear and quantifiable measure of how different the embeddings from various models are.
+    - Useful for identifying outlier models or those that behave significantly differently from others in a group.
+    **Limitations:**
+    - Euclidean distance can be sensitive to the scale of the data, meaning that preprocessing steps like normalization
+      might be necessary to ensure meaningful comparisons.
+    - Does not consider the orientation or angle between vectors, focusing purely on magnitude differences.
+    """
+    figures = []
+    all_stats = []
+    # Generate all pairs of models for comparison
+    for model_A, model_B in combinations(models, 2):
+        embeddings_A = np.stack(dataset.y_pred(model_A))
+        embeddings_B = np.stack(dataset.y_pred(model_B))
+        # Calculate pairwise Euclidean distances
+        distance_matrix = euclidean_distances(embeddings_A, embeddings_B)
+        distances = distance_matrix.flatten()
+        # Generate statistics and add model combination as a column
+        stats_data = {
+            "Combination": f"{model_A.input_id} vs {model_B.input_id}",
+            "Mean": np.mean(distances),
+            "Median": np.median(distances),
+            "Standard Deviation": np.std(distances),
+            "Minimum": np.min(distances),
+            "Maximum": np.max(distances),
+        }
+        all_stats.append(stats_data)
+        # Generate an index for each distance value
+        indices = range(len(distances))
+        # Create the bar chart using Plotly
+        fig = px.bar(
+            x=indices,
+            y=distances,
+            labels={"x": "Pair Index", "y": "Euclidean Distance"},
+            title=f"Euclidean Distance - {model_A.input_id} vs {model_B.input_id}",
+        )
+        fig.update_layout(xaxis_title="Pair Index", yaxis_title="Euclidean Distance")
+        figures.append(fig)
+    # Create a DataFrame from all collected statistics
+    stats_df = pd.DataFrame(all_stats)
+    return (stats_df, *tuple(figures))

validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py ADDED Viewed

@@ -0,0 +1,69 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+import numpy as np
+import plotly.express as px
+from sklearn.metrics.pairwise import euclidean_distances
+def EuclideanDistanceHeatmap(
+    dataset,
+    model,
+    title="Euclidean Distance Matrix",
+    color="Euclidean Distance",
+    xaxis_title="Index",
+    yaxis_title="Index",
+    color_scale="Blues",
+):
+    """
+    Generates an interactive heatmap to visualize the Euclidean distances among embeddings derived from a given model.
+    **Purpose:**
+    This function visualizes the Euclidean distances between embeddings generated by a model, offering insights into the
+    absolute differences between data points. Euclidean distance, a fundamental metric in data analysis, measures the
+    straight-line distance between two points in Euclidean space. It is particularly useful for understanding spatial
+    relationships and clustering tendencies in high-dimensional data.
+    **Test Mechanism:**
+    The function operates through a streamlined process: firstly, embeddings are extracted for each dataset entry using the specified model.
+    Subsequently, it computes the pairwise Euclidean distances among these embeddings. The results are then visualized in an interactive heatmap format,
+    where each cell's color intensity correlates with the distance magnitude between pairs of embeddings, providing a visual assessment of these distances.
+    **Signs of High Risk:**
+    - Uniform Distances: Uniformly low distances across the heatmap might suggest a lack of variability in the data or
+    model overfitting, where the model fails to distinguish between distinct data points effectively.
+    - High Variability: Conversely, excessive variability in distances could indicate inconsistent data representation,
+    potentially leading to unreliable model predictions.
+    **Strengths:**
+    - Provides a direct, intuitive visual representation of distances between embeddings, aiding in the detection of patterns or anomalies.
+    - Allows customization of visual aspects such as the heatmap's title, axis labels, and color scale, adapting to various analytical needs.
+    **Limitations:**
+    - The interpretation of distances can be sensitive to the scale of data; normalization might be necessary for meaningful analysis.
+    - Large datasets may lead to dense, cluttered heatmaps, making it difficult to discern individual distances, potentially requiring
+    techniques like data sampling or dimensionality reduction for clearer visualization.
+    """
+    embeddings = np.stack(dataset.y_pred(model))
+    # Calculate pairwise Euclidean distance
+    distance_matrix = euclidean_distances(embeddings)
+    # Create the heatmap using Plotly
+    fig = px.imshow(
+        distance_matrix,
+        labels=dict(x=xaxis_title, y=yaxis_title, color=color),
+        text_auto=True,
+        aspect="auto",
+        color_continuous_scale=color_scale,
+    )
+    fig.update_layout(
+        title=f"{title} - {model.input_id}",
+        xaxis_title=xaxis_title,
+        yaxis_title=yaxis_title,
+    )
+    return fig

validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py ADDED Viewed

@@ -0,0 +1,78 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+import itertools
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+def PCAComponentsPairwisePlots(dataset, model, n_components=3):
+    """
+    Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model embeddings.
+    **Purpose:**
+    This function visualizes the principal components of embeddings derived from a specified model. Principal Component Analysis (PCA)
+    is a statistical technique that emphasizes variation and uncovers strong patterns in a dataset.
+    It transforms the original variables into new, uncorrelated variables (principal components) that maximize variance.
+    **Test Mechanism:**
+    The function follows a sequential process to visualize PCA components effectively.
+    It starts by extracting embeddings from the dataset, utilizing the model specified by the user.
+    These embeddings are then standardized to ensure zero mean and unit variance, which is crucial to prevent
+    any single feature from dominating due to scale—this standardization is a critical preprocessing step for PCA.
+    Following this, the function calculates the specified number of principal components.
+    The core of the visualization process involves creating scatter plots for each pairwise combination of these principal components.
+    **Signs of High Risk:**
+    - If the principal components do not account for a significant portion of the variance, it may suggest that PCA is not capturing the essential structures of the data.
+    - Similarity in scatter plots across different pairs of components could indicate redundancy in the components, suggesting that fewer dimensions might be sufficient to represent the data.
+    **Strengths:**
+    - Enables a simplified visualization of multivariate data, helping to identify patterns across many variables effectively.
+    - Provides a clear depiction of the directions of maximum variance in the data, which is valuable for feature selection and dimensionality reduction.
+    **Limitations:**
+    - PCA's effectiveness hinges on the scaling of the variables; improper standardization can lead to misleading interpretations.
+    - The interpretation of principal components can be challenging, especially if they capture less significant variances or are difficult to relate back to the original features.
+    """
+    # Get embeddings from the dataset using the model
+    embeddings = np.stack(dataset.y_pred(model))
+    # Standardize the embeddings
+    scaler = StandardScaler()
+    embeddings_scaled = scaler.fit_transform(embeddings)
+    # Perform PCA
+    pca = PCA(n_components=n_components)
+    pca_results = pca.fit_transform(embeddings_scaled)
+    # Prepare DataFrame for Plotly
+    pca_df = pd.DataFrame(
+        pca_results, columns=[f"PC{i+1}" for i in range(n_components)]
+    )
+    # List to store each plot
+    plots = []
+    # Create plots for each pair of principal components
+    for pc1, pc2 in itertools.combinations(range(1, n_components + 1), 2):
+        fig = px.scatter(
+            pca_df,
+            x=f"PC{pc1}",
+            y=f"PC{pc2}",
+            title=f"{getattr(model, 'input_id', 'Unknown Model')} (PC{pc1} vs PC{pc2})",
+            labels={
+                f"PC{pc1}": f"Principal Component {pc1}",
+                f"PC{pc2}": f"Principal Component {pc2}",
+            },
+        )
+        plots.append(fig)
+    # Return the list of plots as a tuple
+    return tuple(plots)

validmind/tests/model_validation/embeddings/StabilityAnalysis.py CHANGED Viewed

@@ -6,8 +6,10 @@ from abc import abstractmethod
 from typing import List
 import numpy as np
+import plotly.express as px
 from sklearn.metrics.pairwise import cosine_similarity
+from validmind.logging import get_logger
 from validmind.vm_models import (
     Figure,
     ResultSummary,
@@ -17,13 +19,14 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
+logger = get_logger(__name__)
 class StabilityAnalysis(ThresholdTest):
     """Base class for embeddings stability analysis tests"""
     required_inputs = ["model", "dataset"]
     default_params = {
-        "text_column": None,
         "mean_similarity_threshold": 0.7,
     }
     metadata = {
@@ -61,25 +64,22 @@ class StabilityAnalysis(ThresholdTest):
     def run(self):
         # Perturb the test dataset
-        col = self.params.get("text_column")
-        if col is None:
-            raise ValueError(
-                "The `text_column` parameter must be provided to the StabilityAnalysis test."
-            )
+        original = self.inputs.dataset.df
+        perturbed = original.copy()
+        perturbed.update(
+            perturbed.select_dtypes(include="object").applymap(self.perturb_data)
+        )
-        original_data_df = self.inputs.dataset.df[col]
-        perturbed_data_df = original_data_df.copy()
-        perturbed_data_df = perturbed_data_df.apply(self.perturb_data)
+        logger.debug(f"Original data: {original}")
+        logger.debug(f"Perturbed data: {perturbed}")
         # Compute embeddings for the original and perturbed dataset
-        original_embeddings = self.inputs.model.predict(original_data_df)
-        perturbed_embeddings = self.inputs.model.predict(perturbed_data_df)
+        original_embeddings = self.inputs.dataset.y_pred(self.inputs.model)
+        perturbed_embeddings = np.stack(self.inputs.model.predict(perturbed))
         # Compute cosine similarities between original and perturbed embeddings
         similarities = cosine_similarity(
-            original_embeddings,
-            perturbed_embeddings,
+            original_embeddings, perturbed_embeddings
         ).diagonal()
         mean = np.mean(similarities)
@@ -91,15 +91,26 @@ class StabilityAnalysis(ThresholdTest):
         # Determine if the test passed based on the mean similarity and threshold
         passed = mean > self.params["mean_similarity_threshold"]
-        # Plot the distribution of cosine similarities using plotly
-        import plotly.express as px
-        fig = px.histogram(
-            x=similarities.flatten(),
-            nbins=100,
-            title="Cosine Similarity Distribution",
-            labels={"x": "Cosine Similarity"},
-        )
+        figures = [
+            px.histogram(
+                x=similarities.flatten(),
+                nbins=100,
+                title="Cosine Similarity Distribution",
+                labels={"x": "Cosine Similarity"},
+            ),
+            px.density_contour(
+                x=similarities.flatten(),
+                nbinsx=100,
+                title="Cosine Similarity Density",
+                labels={"x": "Cosine Similarity"},
+                marginal_x="histogram",
+            ),
+            px.box(
+                x=similarities.flatten(),
+                labels={"x": "Cosine Similarity"},
+                title="Cosine Similarity Box Plot",
+            ),
+        ]
         # For this example, we are not caching the results as done in the reference `run` method
         return self.cache_results(
@@ -121,6 +132,7 @@ class StabilityAnalysis(ThresholdTest):
                     key=self.name,
                     figure=fig,
                 )
+                for fig in figures
             ],
             passed=passed,
         )

validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py CHANGED Viewed

@@ -55,6 +55,9 @@ class StabilityAnalysisKeyword(StabilityAnalysis):
     }
     def perturb_data(self, data: str):
+        if not isinstance(data, str):
+            return data
         # Tokenize the string
         tokens = re.findall(r"[\w']+[.,!?;]?|[\w']+", data)
         modified_tokens = []

validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py CHANGED Viewed

@@ -114,9 +114,15 @@ class StabilityAnalysisRandomNoise(StabilityAnalysis):
     name = "Text Embeddings Stability Analysis to Random Noise"
     default_params = {
         **StabilityAnalysis.default_params,
+        "probability": 0.02,
     }
-    def perturb_data(self, data, probability=0.02):
+    def perturb_data(self, data):
+        if not isinstance(data, str):
+            return data
+        probability = self.params["probability"]
         # Tokenize the string based on spaces
         words = data.split()

validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py CHANGED Viewed

@@ -65,6 +65,9 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
     }
     def perturb_data(self, data):
+        if not isinstance(data, str):
+            return data
         # download the nltk wordnet
         nltk.download("wordnet", quiet=True)

validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py CHANGED Viewed

@@ -61,6 +61,9 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
     }
     def perturb_data(self, data: str):
+        if not isinstance(data, str):
+            return data
         source_lang = self.params["source_lang"]
         target_lang = self.params["target_lang"]

validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py ADDED Viewed

@@ -0,0 +1,99 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+import itertools
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import StandardScaler
+def TSNEComponentsPairwisePlots(
+    dataset,
+    model,
+    n_components=2,
+    perplexity=30,
+    title="t-SNE",
+):
+    """
+    Plots individual scatter plots for pairwise combinations of t-SNE components of embeddings.
+    **Purpose:**
+    This function creates scatter plots for each pairwise combination of t-SNE components derived from model embeddings.
+    t-SNE (t-Distributed Stochastic Neighbor Embedding) is a machine learning algorithm for dimensionality reduction that
+    is particularly well-suited for the visualization of high-dimensional datasets.
+    **Test Mechanism:**
+    The function begins by extracting embeddings from the provided dataset using the specified model.
+    These embeddings are then standardized to ensure that each dimension contributes equally to the distance computation.
+    Following this, the t-SNE algorithm is applied to reduce the dimensionality of the data, with the number of components
+    specified by the user. The results are plotted using Plotly, creating scatter plots for each unique pair of components
+    if more than one component is specified.
+    **Signs of High Risk:**
+    - If the scatter plots show overlapping clusters or indistinct groupings, it might suggest that the
+    t-SNE parameters (such as perplexity) are not optimally set for the given data, or the data itself does not exhibit clear, separable clusters.
+    - Similar plots across different pairs of components could indicate redundancy in the components generated by t-SNE,
+    suggesting that fewer dimensions might be sufficient to represent the data's structure.
+    **Strengths:**
+    - Provides a visual exploration tool for high-dimensional data, simplifying the detection of patterns and clusters which are not apparent in higher dimensions.
+    - Interactive plots generated by Plotly enhance user engagement and allow for a deeper dive into specific areas of the plot, aiding in detailed data analysis.
+    **Limitations:**
+    - The effectiveness of t-SNE is highly dependent on the choice of parameters like perplexity and the number of components,
+    which might require tuning and experimentation for optimal results.
+    - t-SNE visualizations can be misleading if interpreted without considering the stochastic nature of the algorithm;
+    two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a consistent interpretation.
+    """
+    # Get embeddings from the dataset using the model
+    embeddings = np.stack(dataset.y_pred(model))
+    # Standardize the embeddings
+    scaler = StandardScaler()
+    embeddings_scaled = scaler.fit_transform(embeddings)
+    # Perform t-SNE
+    tsne = TSNE(n_components=n_components, perplexity=perplexity)
+    tsne_results = tsne.fit_transform(embeddings_scaled)
+    # Prepare DataFrame for Plotly
+    tsne_df = pd.DataFrame(
+        tsne_results, columns=[f"Component {i+1}" for i in range(n_components)]
+    )
+    # List to store each plot
+    plots = []
+    # Create plots for each pair of t-SNE components (if n_components > 1)
+    if n_components > 1:
+        for comp1, comp2 in itertools.combinations(range(1, n_components + 1), 2):
+            fig = px.scatter(
+                tsne_df,
+                x=f"Component {comp1}",
+                y=f"Component {comp2}",
+                title=f"{title} - {getattr(model, 'input_id', 'Unknown Model')}",
+                labels={
+                    f"Component {comp1}": f"Component {comp1}",
+                    f"Component {comp2}": f"Component {comp2}",
+                },
+            )
+            plots.append(fig)
+    else:
+        fig = px.scatter(
+            tsne_df,
+            x="Component 1",
+            y="Component 1",
+            title=f"{title} - {getattr(model, 'input_id', 'Unknown Model')}",
+            labels={
+                "Component 1": "Component 1",
+            },
+        )
+        plots.append(fig)
+    # Return the list of plots as a tuple
+    return tuple(plots)

validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl

validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl