dstklib 1.0.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. dstk/__init__.py +10 -12
  2. dstk/adaptors/__init__.py +2 -0
  3. dstk/adaptors/adaptors.py +91 -0
  4. dstk/adaptors/typeguards.py +141 -0
  5. dstk/hooks/__init__.py +2 -0
  6. dstk/hooks/hook_tools.py +89 -0
  7. dstk/hooks/type_conversion.py +40 -0
  8. dstk/lib_types/__init__.py +2 -3
  9. dstk/lib_types/dstk_types.py +188 -16
  10. dstk/lib_types/plotly_types.py +1 -0
  11. dstk/method_index.py +32 -0
  12. dstk/models/__init__.py +2 -0
  13. dstk/models/model_tools.py +83 -0
  14. dstk/models/models.py +191 -0
  15. dstk/modules/__init__.py +10 -0
  16. dstk/modules/count_models.py +91 -0
  17. dstk/modules/data_visualization/__init__.py +2 -0
  18. dstk/modules/data_visualization/clustering.py +129 -0
  19. dstk/modules/data_visualization/embeddings.py +101 -0
  20. dstk/modules/geometric_distance.py +114 -0
  21. dstk/modules/ngrams.py +156 -0
  22. dstk/modules/predict_models.py +109 -0
  23. dstk/modules/text_matrix_builder.py +55 -0
  24. dstk/modules/text_processor.py +100 -0
  25. dstk/modules/tokenizer.py +139 -0
  26. dstk/modules/weight_matrix.py +65 -0
  27. dstk/templates/__init__.py +2 -0
  28. dstk/templates/rules.py +59 -0
  29. dstk/templates/templates.py +231 -0
  30. dstk/workflows/__init__.py +2 -0
  31. dstk/workflows/stage_workflows.py +55 -0
  32. dstk/workflows/workflow_tools.py +383 -0
  33. dstklib-2.0.1.dist-info/METADATA +377 -0
  34. dstklib-2.0.1.dist-info/RECORD +43 -0
  35. dstk/collocations.py +0 -121
  36. dstk/count_models.py +0 -112
  37. dstk/geometric_distance.py +0 -107
  38. dstk/lib_types/matplotlib_types.py +0 -4
  39. dstk/lib_types/nltk_types.py +0 -1
  40. dstk/matrix_base.py +0 -113
  41. dstk/pipeline_tools.py +0 -27
  42. dstk/pipelines.py +0 -114
  43. dstk/plot_embeddings.py +0 -240
  44. dstk/predict_models.py +0 -189
  45. dstk/text_matrix_builder.py +0 -87
  46. dstk/text_processor.py +0 -450
  47. dstk/weight_matrix.py +0 -71
  48. dstk/workflow_tools.py +0 -257
  49. dstklib-1.0.2.dist-info/METADATA +0 -369
  50. dstklib-1.0.2.dist-info/RECORD +0 -28
  51. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/LICENSE +0 -0
  52. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/WHEEL +0 -0
  53. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/top_level.txt +0 -0
dstk/models/model_tools.py ADDED
@@ -0,0 +1,83 @@
+ """
+ Module for orchestrating and automating the execution of multiple workflows and hooks.
+
+ Provides the ModelBuilder class, which manages a sequence of WorkflowBuilder, StageWorkflowBuilder, or Hook instances, allowing flexible, stepwise processing of input data through these workflows.
+
+ Features:
+
+ * Sequential execution of workflows with intermediate results.
+ * Options to retrieve results from specific workflows, all workflows, or only the final output.
+ * Supports integration with various workflow types for modular model construction.
+
+ This module facilitates building complex processing models by combining and controlling multiple modular workflows in a unified manner.
+ """
+
+ from ..workflows import WorkflowBuilder, StageWorkflowBuilder
+ from ..hooks import Hook
+
+ from typing import Any
+ from ..lib_types import StepResult, StepGenerator, ResultGenerator
+
+ class ModelBuilder:
+     """
+     Automates the execution of a sequence of WorkflowBuilder, StageWorkflowBuilder, or Hook instances.
+
+     :param workflows: A list of WorkflowBuilder, StageWorkflowBuilder, or Hook instances to execute.
+     :type workflows: list[WorkflowBuilder | StageWorkflowBuilder | Hook]
+
+     Usage:
+
+     .. code-block:: python
+
+         CustomModel = ModelBuilder(workflows=[workflow1, workflow2, hook1])
+         final_result = CustomModel(input_data)
+     """
+
+     def __init__(self, workflows: list[WorkflowBuilder | StageWorkflowBuilder | Hook]):
+         """
+         Initializes ModelBuilder with the given attributes.
+         """
+
+         self.workflows: list[WorkflowBuilder | StageWorkflowBuilder | Hook] = workflows
+
+     def _run(self, input_data: Any) -> StepGenerator:
+         """
+         Executes each workflow or hook sequentially on the input data, yielding intermediate results.
+
+         :param input_data: The initial data to be processed by the workflows.
+         :type input_data: Any
+
+         :return: A generator that yields StepResult objects containing the name of the workflow and the corresponding output after execution.
+         :rtype: StepGenerator
+         """
+         result: Any = input_data
+
+         for workflow in self.workflows:
+             result = workflow(result)
+             yield StepResult(name=workflow.name, result=result)
+
+     def __call__(self, input_data: Any, return_workflows: list[str] | None = None, return_all: bool = False) -> ResultGenerator | StepGenerator | Any:
+         """
+         Runs the workflows on the input data.
+
+         :param input_data: Input data to process.
+         :type input_data: Any
+         :param return_workflows: If provided, yields results only for these workflows. Defaults to None.
+         :type return_workflows: list[str] or None
+         :param return_all: If True, yields results for all workflows. Defaults to False.
+         :type return_all: bool
+
+         :return: Final result, or a generator of step/workflow results.
+         :rtype: ResultGenerator | StepGenerator | Any
+         """
+
+         if return_workflows:
+             return (result for name, result in self._run(input_data) if name in return_workflows)
+         elif return_all:
+             return self._run(input_data)
+         else:
+             result = input_data
+             for _, result in self._run(input_data):
+                 pass
+             return result
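A minimal usage sketch of the three return modes of ModelBuilder (hypothetical `workflow1`, `workflow2`, and `hook1` stand in for configured WorkflowBuilder/Hook instances; StepResult is assumed to expose `name` and `result`, as `_run` suggests):

.. code-block:: python

    from dstk.models.model_tools import ModelBuilder

    CustomModel = ModelBuilder(workflows=[workflow1, workflow2, hook1])

    # Default: run every step, keep only the final output
    final_result = CustomModel(input_data)

    # return_all=True: a generator of StepResult(name, result) per step
    for step in CustomModel(input_data, return_all=True):
        print(step.name, type(step.result))

    # return_workflows: results only for the named workflows
    matrix_only = list(CustomModel(input_data, return_workflows=["Matrix"]))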
dstk/models/models.py ADDED
@@ -0,0 +1,191 @@
+ """
+ This module contains predefined and commonly used distributional semantic models. Each model is implemented as a high-level pipeline that integrates multiple stages of text processing, embedding generation, and similarity computation.
+
+ Currently supported models:
+
+ * *StandardModel*: A count-based model using a context window, PPMI weighting, and dimensionality reduction via SVD. Based on the description found in the book 'Distributional Semantics' by Lenci & Sahlgren (2023).
+ * *SGNSModel*: A prediction-based model using Word2Vec's Skip-Gram with Negative Sampling (SGNS), as described by Lenci & Sahlgren (2023).
+
+ These pipelines are modular and composable, built from reusable workflows to support both experimentation and production use.
+
+ Future versions of this module may include additional models and hybrid approaches.
+ """
+
+ from ..workflows import WorkflowBuilder, TextProcessing, StageWorkflowBuilder, Wrapper
+ from ..templates import TextMatrixBuilderTemplate, WeightMatrixTemplate, CountModelsTemplate, GeometricDistanceTemplate, PredictModelsTemplate
+ from .model_tools import ModelBuilder
+ from ..hooks import ModelToDataframe, Hook
+
+ from typing import Any
+ from ..lib_types import Language, StepResult, StepGenerator, ResultGenerator
+
+ def StandardModel(text: str, model: str | Language, custom_stop_words: list[str] | None = None, window_size: int = 2, n_components: int = 100, return_workflows: list[str] | None = None, return_all: bool = False) -> ResultGenerator | StepGenerator | Wrapper:
+     """
+     This pipeline generates word embeddings using the standard model as defined by Lenci & Sahlgren (2023, p. 97). It preprocesses the text by removing stop words, lowercasing the tokens, and segmenting the text with a context window. The co-occurrence matrix is weighted with PPMI and reduced with truncated SVD. Cosine similarity is then applied as the distance metric.
+
+     :param text: The text to extract the embeddings from.
+     :type text: str
+     :param model: The spaCy NLP model to tokenize the text.
+     :type model: str or Language
+     :param custom_stop_words: Custom stop words passed to the remove_stop_words step. Defaults to None.
+     :type custom_stop_words: list[str] or None
+     :param window_size: The size of the context window to segment the text. Defaults to 2.
+     :type window_size: int
+     :param n_components: The number of dimensions of the embeddings. Defaults to 100.
+     :type n_components: int
+     :param return_workflows: If provided, yields results only for these workflows. Defaults to None.
+     :type return_workflows: list[str] or None
+     :param return_all: If True, yields results for all workflows. Defaults to False.
+     :type return_all: bool
+
+     :return: Wrapper for cosine_similarity and nearest_neighbors, or a generator of step/workflow results.
+     :rtype: ResultGenerator | StepGenerator | Wrapper
+     """
+
+     StandardTextWorkflow: StageWorkflowBuilder = TextProcessing(
+         name="ProcessedText",
+         workflows={
+             "tokenizer": [
+                 {"apply_model": {"model": model}},
+                 {"get_tokens": {}},
+                 {"remove_stop_words": {"custom_stop_words": custom_stop_words}}
+             ],
+             "ngrams": [
+                 {"extract_ngrams": {"window_size": window_size}}
+             ],
+             "text_processor": [
+                 {"tokens_to_text": {}},
+                 {"to_lower": {}},
+                 {"join": {}}
+             ]
+         }
+     )
+
+     StandardMatrix: WorkflowBuilder = WorkflowBuilder(
+         name="Matrix",
+         module_name="text_matrix_builder",
+         template=TextMatrixBuilderTemplate,
+         workflow=[
+             {"create_dtm": {}},
+             {"create_co_occurrence_matrix": {}}
+         ]
+     )
+
+     StandardWeightMatrix: WorkflowBuilder = WorkflowBuilder(
+         name="WeightedMatrix",
+         module_name="weight_matrix",
+         template=WeightMatrixTemplate,
+         workflow=[
+             {"pmi": {"positive": True}}
+         ]
+     )
+
+     StandardCountModels: WorkflowBuilder = WorkflowBuilder(
+         name="Embeddings",
+         module_name="count_models",
+         template=CountModelsTemplate,
+         workflow=[
+             {"scale_matrix": {}},
+             {"svd_embeddings": {"n_components": n_components}}
+         ]
+     )
+
+     StandardGeometricDistance: WorkflowBuilder = WorkflowBuilder(
+         name="GeometricDistance",
+         module_name="geometric_distance",
+         template=GeometricDistanceTemplate,
+         workflow=[
+             {"cos_similarity": {}},
+             {"nearest_neighbors": {}}
+         ],
+         wrapper=True
+     )
+
+     Model: ModelBuilder = ModelBuilder(
+         workflows=[
+             StandardTextWorkflow,
+             StandardMatrix,
+             StandardWeightMatrix,
+             StandardCountModels,
+             StandardGeometricDistance
+         ]
+     )
+
+     return Model(input_data=text, return_workflows=return_workflows, return_all=return_all)
+
+
+ def SGNSModel(text: str, model: str | Language, path: str, return_workflows: list[str] | None = None, return_all: bool = False, **kwargs) -> ResultGenerator | StepGenerator | Wrapper:
+     """
+     This pipeline generates word embeddings using Skip-Gram with Negative Sampling (SGNS) as defined by Lenci & Sahlgren (2023, p. 162). It preprocesses the text by extracting the sentences, removing stop words, and lowercasing the tokens. The embeddings are produced by training a Word2Vec model in SGNS mode. Cosine similarity is then applied as the distance metric.
+
+     :param text: The text to extract the embeddings from.
+     :type text: str
+     :param model: The spaCy NLP model to tokenize the text.
+     :type model: str or Language
+     :param path: The path to save the processed sentences.
+     :type path: str
+     :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. Common options include:
+
+         * **vector_size:** Size of the word embedding vectors.
+         * **workers:** Number of CPU cores to be used during the training process.
+         * **negative:** Specifies how many "noise words" to sample for each positive example during training. Typical values range from 5 to 20. Higher values make training slower but can improve embedding quality.
+         * **window (int):** Maximum distance between the current and predicted word.
+         * **min_count (int):** Ignores all words with total frequency lower than this.
+
+         For more information check: https://radimrehurek.com/gensim/models/word2vec.html
+     :param return_workflows: If provided, yields results only for these workflows. Defaults to None.
+     :type return_workflows: list[str] or None
+     :param return_all: If True, yields results for all workflows. Defaults to False.
+     :type return_all: bool
+
+     :return: Wrapper for cosine_similarity and nearest_neighbors, or a generator of step/workflow results.
+     :rtype: ResultGenerator | StepGenerator | Wrapper
+     """
+
+     PredictTextWorkflow: StageWorkflowBuilder = TextProcessing(
+         name="ProcessedText",
+         workflows={
+             "tokenizer": [
+                 {"apply_model": {"model": model}},
+                 {"get_sentences": {}},
+                 {"remove_stop_words": {}}
+             ],
+             "text_processor": [
+                 {"tokens_to_text": {}},
+                 {"to_lower": {}},
+                 {"join": {}},
+                 {"save_to_file": {"path": path}}
+             ]
+         }
+     )
+
+     SGNSPredictWorkflow: WorkflowBuilder = WorkflowBuilder(
+         name="SGNS",
+         module_name="predict_models",
+         template=PredictModelsTemplate,
+         workflow=[
+             {"word2vec": {"sg": 1, **kwargs}}
+         ]
+     )
+
+     PredictGeometricDistance: WorkflowBuilder = WorkflowBuilder(
+         name="GeometricDistance",
+         module_name="geometric_distance",
+         template=GeometricDistanceTemplate,
+         workflow=[
+             {"cos_similarity": {}},
+             {"nearest_neighbors": {}}
+         ],
+         wrapper=True
+     )
+
+     EmbeddingsHook: Hook = ModelToDataframe.rename(new_name="Embeddings")
+
+     Model: ModelBuilder = ModelBuilder(
+         workflows=[
+             PredictTextWorkflow,
+             SGNSPredictWorkflow,
+             EmbeddingsHook,
+             PredictGeometricDistance
+         ]
+     )
+
+     return Model(input_data=text, return_workflows=return_workflows, return_all=return_all)
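A hedged end-to-end sketch of calling StandardModel (assumes a spaCy model such as `en_core_web_sm` is installed; the exact interface of the returned Wrapper is defined in `dstk/workflows` and not shown in this diff):

.. code-block:: python

    from dstk.models.models import StandardModel

    with open("corpus.txt", encoding="utf-8") as f:  # hypothetical corpus file
        text = f.read()

    # Final output: a Wrapper around cos_similarity / nearest_neighbors
    distance = StandardModel(text, model="en_core_web_sm", window_size=2, n_components=100)

    # Or inspect each stage instead of the final wrapper
    for step in StandardModel(text, model="en_core_web_sm", return_all=True):
        print(step.name)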
dstk/modules/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .tokenizer import *
+ from .text_processor import *
+ from .ngrams import *
+ from .text_matrix_builder import *
+ from .weight_matrix import *
+ from .count_models import *
+ from .geometric_distance import *
+ from .predict_models import *
+
+ from .data_visualization import *
dstk/modules/count_models.py ADDED
@@ -0,0 +1,91 @@
+ """
+ This module offers functionality to transform and reduce high-dimensional text data represented as matrices, enabling more effective downstream analysis and modeling.
+
+ Key features include:
+
+ * Scaling input matrices to zero mean and unit variance using standardization.
+ * Generating low-dimensional word embeddings from co-occurrence matrices using dimensionality reduction techniques:
+     * Truncated Singular Value Decomposition (SVD)
+     * Principal Component Analysis (PCA)
+
+ These techniques help distill semantic information from sparse and high-dimensional co-occurrence data, facilitating tasks such as clustering, visualization, and feature extraction in natural language processing pipelines.
+
+ All functions return results as Pandas DataFrames for seamless integration with data workflows.
+ """
+
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.decomposition import PCA, TruncatedSVD
+ import pandas as pd
+
+ from ..lib_types import ndarray, DataFrame
+
+ def scale_matrix(matrix: DataFrame, **kwargs) -> DataFrame:
+     """
+     Scales the input matrix to have zero mean and unit variance for each feature.
+
+     This method applies standardization using scikit-learn's StandardScaler, which transforms the data such that each column (feature) has a mean of 0 and a standard deviation of 1.
+
+     :param matrix: The input data to scale.
+     :type matrix: DataFrame
+     :param kwargs: Additional keyword arguments to pass to sklearn's StandardScaler. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
+
+     :returns: A scaled matrix.
+     :rtype: DataFrame
+     """
+
+     scaler: StandardScaler = StandardScaler(**kwargs)
+     scaled_matrix: ndarray = scaler.fit_transform(matrix)
+
+     return pd.DataFrame(scaled_matrix, index=matrix.index, columns=matrix.columns)
+
+ def svd_embeddings(matrix: DataFrame, n_components: int = 100, **kwargs) -> DataFrame:
+     """
+     Generates word embeddings using truncated Singular Value Decomposition (SVD).
+
+     :param matrix: A co-occurrence matrix from which embeddings will be generated.
+     :type matrix: DataFrame
+     :param n_components: The number of dimensions to reduce the word embeddings to. Defaults to 100.
+     :type n_components: int
+     :param kwargs: Additional keyword arguments to pass to sklearn's TruncatedSVD. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
+
+     :returns: A DataFrame of word embeddings generated by SVD.
+     :rtype: DataFrame
+     """
+
+     svd: TruncatedSVD = TruncatedSVD(n_components=n_components, **kwargs)
+     embeddings: ndarray = svd.fit_transform(matrix)
+
+     n_dims: int = embeddings.shape[1]
+     columns: list[str] = [f"dim_{num}" for num in range(n_dims)]
+
+     return pd.DataFrame(embeddings, index=matrix.index, columns=columns)
+
+ def pca_embeddings(matrix: DataFrame, n_components: int | float = 100, **kwargs) -> DataFrame:
+     """
+     Generates word embeddings using Principal Component Analysis (PCA).
+
+     :param matrix: A co-occurrence matrix from which embeddings will be generated.
+     :type matrix: DataFrame
+     :param n_components: If an integer, the number of dimensions to reduce the word embeddings to. If a float between 0 and 1, the proportion of variance to preserve. Defaults to 100.
+     :type n_components: int or float
+     :param kwargs: Additional keyword arguments to pass to sklearn's PCA. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
+
+     :returns: A DataFrame of word embeddings generated by PCA.
+     :rtype: DataFrame
+     """
+
+     pca: PCA = PCA(n_components=n_components, **kwargs)
+     embeddings: ndarray = pca.fit_transform(matrix)
+
+     n_dims: int = embeddings.shape[1]
+     columns: list[str] = [f"dim_{num}" for num in range(n_dims)]
+
+     return pd.DataFrame(embeddings, index=matrix.index, columns=columns)
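A small self-contained sketch of the scale-then-reduce flow on a toy co-occurrence matrix (values are made up; note that TruncatedSVD requires n_components to be strictly smaller than the number of features):

.. code-block:: python

    import pandas as pd

    from dstk.modules.count_models import scale_matrix, svd_embeddings

    words = ["cat", "dog", "car"]
    cooc = pd.DataFrame(
        [[2, 1, 0],
         [1, 3, 0],
         [0, 0, 4]],
        index=words, columns=words,
    )

    scaled = scale_matrix(cooc)                          # zero mean, unit variance per column
    embeddings = svd_embeddings(scaled, n_components=2)  # rows indexed by word
    print(embeddings)                                    # columns dim_0, dim_1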
dstk/modules/data_visualization/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .clustering import *
+ from .embeddings import *
dstk/modules/data_visualization/clustering.py ADDED
@@ -0,0 +1,129 @@
+ """
+ Clustering utilities for word embeddings analysis and visualization.
+
+ This module provides functions to determine the optimal number of clusters for word embeddings using popular methods such as the Elbow method and the Silhouette score. It also assigns cluster labels to the embeddings accordingly.
+
+ Key features:
+
+ * *elbow_method:* Applies the Elbow method to find the best cluster count by locating the knee of the inertia curve.
+ * *extract_silhouette_score:* Uses the Silhouette score to evaluate clustering quality and determine the optimal cluster number.
+ * Both functions support visualization of their respective metrics and can save plots to file.
+ * Cluster labels are appended to the embeddings DataFrame for easy downstream use, such as visualization or further analysis.
+
+ These utilities are designed to work seamlessly with word embedding DataFrames, enabling efficient and interpretable clustering analysis.
+ """
+
+ import plotly.express as px
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import silhouette_score
+ from kneed import KneeLocator
+
+ from ...lib_types import DataFrame, Figure
+
+ def elbow_method(embeddings: DataFrame, max_clusters: int, show: bool = False, path: str | None = None) -> DataFrame:
+     """
+     Applies the Elbow method to determine the optimal number of clusters for word embeddings, and assigns cluster labels based on the identified value.
+
+     :param embeddings: A dataframe containing the word embeddings.
+     :type embeddings: DataFrame
+     :param max_clusters: The maximum number of clusters to evaluate when applying the Elbow method.
+     :type max_clusters: int
+     :param show: If True, shows the plot. Defaults to False.
+     :type show: bool
+     :param path: If provided, saves the plot in the specified path. Defaults to None.
+     :type path: str or None
+
+     :returns: A copy of the input DataFrame with an additional `'cluster'` column containing the cluster labels.
+     :rtype: DataFrame
+     """
+     df: DataFrame = embeddings.copy()
+     cluster_counts: list[int] = []
+     inertias: list[float] = []
+
+     for k in range(1, max_clusters + 1):
+         kmeans: KMeans = KMeans(n_clusters=k, random_state=42)
+         kmeans.fit(embeddings)
+
+         cluster_counts.append(k)
+         inertias.append(kmeans.inertia_)
+
+     elbow: KneeLocator = KneeLocator(cluster_counts, inertias, curve="convex", direction="decreasing")
+
+     elbow_plot: Figure = px.line(
+         x=cluster_counts,
+         y=inertias,
+         markers=True,
+         title="Elbow method",
+         labels={
+             "x": "Number of clusters",
+             "y": "Inertia"
+         }
+     )
+
+     if path:
+         elbow_plot.write_html(path)
+
+     if show:
+         elbow_plot.show()
+
+     print(f"The optimal number of clusters is {elbow.knee} with an inertia of {elbow.knee_y}")
+
+     cluster_kmeans: KMeans = KMeans(n_clusters=elbow.knee, random_state=42)
+     df["cluster"] = cluster_kmeans.fit_predict(df)
+
+     return df
+
+ def extract_silhouette_score(embeddings: DataFrame, max_clusters: int, show: bool = False, path: str | None = None, **kwargs) -> DataFrame:
+     """
+     Uses the Silhouette score to determine the optimal number of clusters for word embeddings, and assigns cluster labels based on the identified value.
+
+     :param embeddings: A dataframe containing the word embeddings.
+     :type embeddings: DataFrame
+     :param max_clusters: The maximum number of clusters to evaluate when computing the Silhouette score.
+     :type max_clusters: int
+     :param show: If True, shows the plot. Defaults to False.
+     :type show: bool
+     :param path: If provided, saves the plot in the specified path. Defaults to None.
+     :type path: str or None
+     :param kwargs: Additional keyword arguments to pass to sklearn.metrics' silhouette_score. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
+
+     :returns: A copy of the input DataFrame with an additional `'cluster'` column containing the cluster labels.
+     :rtype: DataFrame
+     """
+     df: DataFrame = embeddings.copy()
+
+     sil_scores: list[tuple[int, float]] = []
+
+     for k in range(2, max_clusters + 1):
+         kmeans: KMeans = KMeans(n_clusters=k, random_state=42)
+         kmeans.fit(embeddings)
+         sil_score: float = silhouette_score(embeddings, kmeans.labels_, **kwargs)
+         sil_scores.append((k, sil_score))
+
+     highest_score: tuple[int, float] = max(sil_scores, key=lambda tup: tup[1])
+     print(f"The optimal number of clusters is {highest_score[0]} with a Silhouette score of {highest_score[1]}")
+
+     cluster_kmeans: KMeans = KMeans(n_clusters=highest_score[0], random_state=42)
+     df["cluster"] = cluster_kmeans.fit_predict(df)
+
+     clusters, scores = zip(*sil_scores)
+
+     sil_plot: Figure = px.line(
+         x=clusters,
+         y=scores,
+         markers=True,
+         title="Silhouette Score",
+         labels={
+             "x": "Number of Clusters",
+             "y": "Silhouette Score"
+         }
+     )
+
+     if path:
+         sil_plot.write_html(path)
+
+     if show:
+         sil_plot.show()
+
+     return df
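A brief sketch of clustering a toy embeddings DataFrame with the Silhouette-based helper (in practice the input would come from svd_embeddings or pca_embeddings; with four points, max_clusters must stay at or below 3, since silhouette_score needs fewer labels than samples):

.. code-block:: python

    import pandas as pd

    from dstk.modules.data_visualization.clustering import extract_silhouette_score

    embeddings = pd.DataFrame(
        {"dim_0": [0.1, 0.2, 5.0, 5.1], "dim_1": [0.0, 0.1, 4.9, 5.2]},
        index=["cat", "dog", "car", "truck"],
    )

    # Evaluates k = 2..3, prints the best k, and appends a 'cluster' column
    clustered = extract_silhouette_score(embeddings, max_clusters=3)
    print(clustered["cluster"])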
dstk/modules/data_visualization/embeddings.py ADDED
@@ -0,0 +1,101 @@
+ """
+ Visualization utilities for word embeddings using UMAP dimensionality reduction.
+
+ This module provides a function to project high-dimensional word embeddings into 2D or 3D space for visualization purposes. It uses UMAP to reduce dimensionality while preserving local and global structure, enabling intuitive exploration of semantic relationships between words.
+
+ Key features:
+
+ * Supports 2D and 3D scatter plots of word embeddings.
+ * Optionally displays word labels and cluster assignments.
+ * Allows customization of UMAP parameters such as number of neighbors, distance metric, and minimum distance.
+ * Supports saving interactive Plotly visualizations as HTML files.
+
+ This utility helps linguists, NLP practitioners, and data scientists gain insights from embedding spaces through visual inspection.
+ """
+
+ import plotly.express as px
+ from umap import UMAP
+ import pandas as pd
+
+ from ...lib_types import ndarray, DataFrame, Figure
+
+ def plot_embeddings(embeddings: DataFrame, n_dimensions: int = 2, labels: bool = False, show: bool = True, path: str | None = None, umap_neighbors: int = 15, umap_metric: str = "cosine", umap_dist: float = 0.1) -> Figure:
+     """
+     Generates a plot of the word embeddings using UMAP for dimensionality reduction.
+
+     :param embeddings: A dataframe containing the word embeddings.
+     :type embeddings: DataFrame
+     :param n_dimensions: The number of dimensions for the plot. Must be 2 or 3, corresponding to a 2D or 3D scatter plot respectively. This also determines the dimensionality UMAP will reduce the embeddings to. Defaults to 2.
+     :type n_dimensions: int
+     :param labels: Whether to show word labels on each point. Defaults to False.
+     :type labels: bool
+     :param show: If True, shows the plot. Defaults to True.
+     :type show: bool
+     :param path: If provided, saves the plot in the specified path. Defaults to None.
+     :type path: str or None
+     :param umap_neighbors: Controls how UMAP balances local versus global structure. Higher values consider a broader context when reducing dimensions. Defaults to 15.
+     :type umap_neighbors: int
+     :param umap_metric: The distance metric UMAP uses to assess similarity between words (e.g., "cosine", "euclidean"). Defaults to "cosine", which is common for word embeddings.
+     :type umap_metric: str
+     :param umap_dist: Controls how tightly UMAP packs points together. Lower values keep similar words closer in the projected space. Defaults to 0.1.
+     :type umap_dist: float
+
+     :return: A Plotly Figure object containing the 2D or 3D scatter plot.
+     :rtype: Figure
+     """
+
+     if n_dimensions not in (2, 3):
+         raise ValueError("Only 2D or 3D plots are supported (n_dimensions=2 or 3)")
+
+     reducer: UMAP = UMAP(n_components=n_dimensions, n_neighbors=umap_neighbors, min_dist=umap_dist, metric=umap_metric)
+     umap_embeddings: ndarray = reducer.fit_transform(embeddings)
+
+     cols: list[str] = [f"Semantic Axis {i+1}" for i in range(n_dimensions)]
+
+     umap_df = pd.DataFrame(umap_embeddings, index=embeddings.index, columns=cols)
+
+     if "cluster" in embeddings.columns:
+         umap_df["Cluster"] = embeddings["cluster"]
+     else:
+         umap_df["Cluster"] = "None"
+
+     umap_df["Word"] = umap_df.index
+
+     scatter: Figure
+
+     if n_dimensions == 2:
+         scatter = px.scatter(
+             umap_df,
+             x=cols[0],
+             y=cols[1],
+             color="Cluster",
+             text="Word" if labels else None,
+             hover_data=["Word"] + cols + ["Cluster"],
+             title="2D Projection of word embeddings",
+             color_continuous_scale="Spectral"
+         )
+
+         scatter.update_traces(textfont_size=10, textposition="top center")
+     else:
+         scatter = px.scatter_3d(
+             umap_df,
+             x=cols[0],
+             y=cols[1],
+             z=cols[2],
+             color="Cluster",
+             text="Word" if labels else None,
+             hover_data=["Word"] + cols + ["Cluster"],
+             title="3D Projection of word embeddings",
+             color_continuous_scale="Spectral"
+         )
+
+         scatter.update_traces(textfont_size=14, textposition="top center")
+
+     if path:
+         scatter.write_html(path)
+
+     if show:
+         scatter.show()
+
+     return scatter
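A short sketch of projecting the clustered toy embeddings from the previous example into 2D (assumes umap-learn is installed; umap_neighbors is lowered because UMAP's default of 15 exceeds the four-point toy input):

.. code-block:: python

    from dstk.modules.data_visualization.embeddings import plot_embeddings

    # 'clustered' carries a 'cluster' column, so points are colour-coded
    fig = plot_embeddings(
        clustered,
        n_dimensions=2,
        labels=True,
        show=False,
        path="embeddings.html",  # hypothetical output path
        umap_neighbors=2,
    )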