dstklib 1.0.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. dstk/__init__.py +10 -12
  2. dstk/adaptors/__init__.py +2 -0
  3. dstk/adaptors/adaptors.py +91 -0
  4. dstk/adaptors/typeguards.py +141 -0
  5. dstk/hooks/__init__.py +2 -0
  6. dstk/hooks/hook_tools.py +89 -0
  7. dstk/hooks/type_conversion.py +40 -0
  8. dstk/lib_types/__init__.py +2 -3
  9. dstk/lib_types/dstk_types.py +188 -16
  10. dstk/lib_types/plotly_types.py +1 -0
  11. dstk/method_index.py +32 -0
  12. dstk/models/__init__.py +2 -0
  13. dstk/models/model_tools.py +83 -0
  14. dstk/models/models.py +191 -0
  15. dstk/modules/__init__.py +10 -0
  16. dstk/modules/count_models.py +91 -0
  17. dstk/modules/data_visualization/__init__.py +2 -0
  18. dstk/modules/data_visualization/clustering.py +129 -0
  19. dstk/modules/data_visualization/embeddings.py +101 -0
  20. dstk/modules/geometric_distance.py +114 -0
  21. dstk/modules/ngrams.py +156 -0
  22. dstk/modules/predict_models.py +109 -0
  23. dstk/modules/text_matrix_builder.py +55 -0
  24. dstk/modules/text_processor.py +100 -0
  25. dstk/modules/tokenizer.py +139 -0
  26. dstk/modules/weight_matrix.py +65 -0
  27. dstk/templates/__init__.py +2 -0
  28. dstk/templates/rules.py +59 -0
  29. dstk/templates/templates.py +231 -0
  30. dstk/workflows/__init__.py +2 -0
  31. dstk/workflows/stage_workflows.py +55 -0
  32. dstk/workflows/workflow_tools.py +383 -0
  33. dstklib-2.0.1.dist-info/METADATA +377 -0
  34. dstklib-2.0.1.dist-info/RECORD +43 -0
  35. dstk/collocations.py +0 -121
  36. dstk/count_models.py +0 -112
  37. dstk/geometric_distance.py +0 -107
  38. dstk/lib_types/matplotlib_types.py +0 -4
  39. dstk/lib_types/nltk_types.py +0 -1
  40. dstk/matrix_base.py +0 -113
  41. dstk/pipeline_tools.py +0 -27
  42. dstk/pipelines.py +0 -114
  43. dstk/plot_embeddings.py +0 -240
  44. dstk/predict_models.py +0 -189
  45. dstk/text_matrix_builder.py +0 -87
  46. dstk/text_processor.py +0 -450
  47. dstk/weight_matrix.py +0 -71
  48. dstk/workflow_tools.py +0 -257
  49. dstklib-1.0.2.dist-info/METADATA +0 -369
  50. dstklib-1.0.2.dist-info/RECORD +0 -28
  51. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/LICENSE +0 -0
  52. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/WHEEL +0 -0
  53. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/top_level.txt +0 -0
dstk/models/model_tools.py ADDED
@@ -0,0 +1,83 @@
+ """
+ Module for orchestrating and automating the execution of multiple workflows and hooks.
+
+ Provides the ModelBuilder class, which manages a sequence of WorkflowBuilder, StageWorkflowBuilder, or Hook instances, allowing flexible, stepwise processing of input data through these workflows.
+
+ Features:
+
+ * Sequential execution of workflows with intermediate results.
+ * Options to retrieve results from specific workflows, all workflows, or only the final output.
+ * Supports integration with various workflow types for modular model construction.
+
+ This module facilitates building complex processing models by combining and controlling multiple modular workflows in a unified manner.
+ """
+
+ from ..workflows import WorkflowBuilder, StageWorkflowBuilder
+ from ..hooks import Hook
+
+ from typing import Any
+ from ..lib_types import StepResult, StepGenerator, ResultGenerator
+
+ class ModelBuilder:
+     """
+     Automates the execution of a sequence of WorkflowBuilder, StageWorkflowBuilder, or Hook instances.
+
+     :param workflows: A list of WorkflowBuilder, StageWorkflowBuilder, or Hook instances to execute.
+     :type workflows: list[WorkflowBuilder | StageWorkflowBuilder | Hook]
+
+     Usage:
+
+     .. code-block:: python
+
+         CustomModel = ModelBuilder(workflows=[workflow1, workflow2, hook1])
+         final_result = CustomModel(input_data)
+     """
+
+     def __init__(self, workflows: list[WorkflowBuilder | StageWorkflowBuilder | Hook]):
+         """
+         Initializes ModelBuilder with the given attributes.
+         """
+
+         self.workflows: list[WorkflowBuilder | StageWorkflowBuilder | Hook] = workflows
+
+     def _run(self, input_data: Any) -> StepGenerator:
+         """
+         Executes each workflow or hook sequentially on the input data, yielding intermediate results.
+
+         :param input_data: The initial data to be processed by the workflows.
+         :type input_data: Any
+
+         :return: A generator that yields StepResult objects containing the name of the workflow and the corresponding output after execution.
+         :rtype: StepGenerator
+         """
+         result: Any = input_data
+
+         for workflow in self.workflows:
+             result = workflow(result)
+             yield StepResult(name=workflow.name, result=result)
+
+     def __call__(self, input_data: Any, return_workflows: list[str] | None = None, return_all: bool = False) -> ResultGenerator | StepGenerator | Any:
+         """
+         Runs the workflows on the input data.
+
+         :param input_data: Input data to process.
+         :type input_data: Any
+         :param return_workflows: If provided, yields results only for these workflows. Defaults to None.
+         :type return_workflows: list[str] or None
+         :param return_all: If True, yields results for all workflows. Defaults to False.
+         :type return_all: bool
+
+         :return: Final result, or a generator of step/workflow results.
+         :rtype: ResultGenerator | StepGenerator | Any
+         """
+
+         if return_workflows:
+             return (result for name, result in self._run(input_data) if name in return_workflows)
+         elif return_all:
+             return self._run(input_data)
+         else:
+             result = input_data
+             for _, result in self._run(input_data):
+                 pass
+             return result
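A minimal usage sketch of the three return modes of ModelBuilder (hypothetical `workflow1`, `workflow2`, and `hook1` stand in for configured WorkflowBuilder/Hook instances; StepResult is assumed to expose `name` and `result`, as `_run` suggests):

.. code-block:: python

    from dstk.models.model_tools import ModelBuilder

    CustomModel = ModelBuilder(workflows=[workflow1, workflow2, hook1])

    # Default: run every step, keep only the final output
    final_result = CustomModel(input_data)

    # return_all=True: a generator of StepResult(name, result) per step
    for step in CustomModel(input_data, return_all=True):
        print(step.name, type(step.result))

    # return_workflows: results only for the named workflows
    matrix_only = list(CustomModel(input_data, return_workflows=["Matrix"]))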
dstk/models/models.py ADDED
@@ -0,0 +1,191 @@
+ """
+ This module contains predefined and commonly used distributional semantic models. Each model is implemented as a high-level pipeline that integrates multiple stages of text processing, embedding generation, and similarity computation.
+
+ Currently supported models:
+
+ * *StandardModel*: A count-based model using a context window, PPMI weighting, and dimensionality reduction via SVD. Based on the description found in the book 'Distributional Semantics' by Lenci & Sahlgren (2023).
+ * *SGNSModel*: A prediction-based model using Word2Vec's Skip-Gram with Negative Sampling (SGNS), as described by Lenci & Sahlgren (2023).
+
+ These pipelines are modular and composable, built from reusable workflows to support both experimentation and production use.
+
+ Future versions of this module may include additional models and hybrid approaches.
+ """
+
+ from ..workflows import WorkflowBuilder, TextProcessing, StageWorkflowBuilder, Wrapper
+ from ..templates import TextMatrixBuilderTemplate, WeightMatrixTemplate, CountModelsTemplate, GeometricDistanceTemplate, PredictModelsTemplate
+ from .model_tools import ModelBuilder
+ from ..hooks import ModelToDataframe, Hook
+
+ from typing import Any
+ from ..lib_types import Language, StepResult, StepGenerator, ResultGenerator
+
+ def StandardModel(text: str, model: str | Language, custom_stop_words: list[str] | None = None, window_size: int = 2, n_components: int = 100, return_workflows: list[str] | None = None, return_all: bool = False) -> ResultGenerator | StepGenerator | Wrapper:
+     """
+     This pipeline generates word embeddings using the standard model as defined by Lenci & Sahlgren (2023, p. 97). It preprocesses the text by removing stop words, lowercasing the tokens, and segmenting the text with a context window. The co-occurrence matrix is weighted with PPMI and reduced with truncated SVD. Cosine similarity is then applied as the distance metric.
+
+     :param text: The text to extract the embeddings from.
+     :type text: str
+     :param model: The spaCy NLP model to tokenize the text.
+     :type model: str or Language
+     :param custom_stop_words: Custom stop words passed to the remove_stop_words step. Defaults to None.
+     :type custom_stop_words: list[str] or None
+     :param window_size: The size of the context window to segment the text. Defaults to 2.
+     :type window_size: int
+     :param n_components: The number of dimensions of the embeddings. Defaults to 100.
+     :type n_components: int
+     :param return_workflows: If provided, yields results only for these workflows. Defaults to None.
+     :type return_workflows: list[str] or None
+     :param return_all: If True, yields results for all workflows. Defaults to False.
+     :type return_all: bool
+
+     :return: Wrapper for cosine_similarity and nearest_neighbors, or a generator of step/workflow results.
+     :rtype: ResultGenerator | StepGenerator | Wrapper
+     """
+
+     StandardTextWorkflow: StageWorkflowBuilder = TextProcessing(
+         name="ProcessedText",
+         workflows={
+             "tokenizer": [
+                 {"apply_model": {"model": model}},
+                 {"get_tokens": {}},
+                 {"remove_stop_words": {"custom_stop_words": custom_stop_words}}
+             ],
+             "ngrams": [
+                 {"extract_ngrams": {"window_size": window_size}}
+             ],
+             "text_processor": [
+                 {"tokens_to_text": {}},
+                 {"to_lower": {}},
+                 {"join": {}}
+             ]
+         }
+     )
+
+     StandardMatrix: WorkflowBuilder = WorkflowBuilder(
+         name="Matrix",
+         module_name="text_matrix_builder",
+         template=TextMatrixBuilderTemplate,
+         workflow=[
+             {"create_dtm": {}},
+             {"create_co_occurrence_matrix": {}}
+         ]
+     )
+
+     StandardWeightMatrix: WorkflowBuilder = WorkflowBuilder(
+         name="WeightedMatrix",
+         module_name="weight_matrix",
+         template=WeightMatrixTemplate,
+         workflow=[
+             {"pmi": {"positive": True}}
+         ]
+     )
+
+     StandardCountModels: WorkflowBuilder = WorkflowBuilder(
+         name="Embeddings",
+         module_name="count_models",
+         template=CountModelsTemplate,
+         workflow=[
+             {"scale_matrix": {}},
+             {"svd_embeddings": {"n_components": n_components}}
+         ]
+     )
+
+     StandardGeometricDistance: WorkflowBuilder = WorkflowBuilder(
+         name="GeometricDistance",
+         module_name="geometric_distance",
+         template=GeometricDistanceTemplate,
+         workflow=[
+             {"cos_similarity": {}},
+             {"nearest_neighbors": {}}
+         ],
+         wrapper=True
+     )
+
+     Model: ModelBuilder = ModelBuilder(
+         workflows=[
+             StandardTextWorkflow,
+             StandardMatrix,
+             StandardWeightMatrix,
+             StandardCountModels,
+             StandardGeometricDistance
+         ]
+     )
+
+     return Model(input_data=text, return_workflows=return_workflows, return_all=return_all)
+
+
+ def SGNSModel(text: str, model: str | Language, path: str, return_workflows: list[str] | None = None, return_all: bool = False, **kwargs) -> ResultGenerator | StepGenerator | Wrapper:
+     """
+     This pipeline generates word embeddings using Skip-Gram with Negative Sampling (SGNS) as defined by Lenci & Sahlgren (2023, p. 162). It preprocesses the text by extracting the sentences, removing stop words, and lowercasing the tokens. The embeddings are produced by training a Word2Vec model in SGNS mode. Cosine similarity is then applied as the distance metric.
+
+     :param text: The text to extract the embeddings from.
+     :type text: str
+     :param model: The spaCy NLP model to tokenize the text.
+     :type model: str or Language
+     :param path: The path to save the processed sentences.
+     :type path: str
+     :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. Common options include:
+
+         * **vector_size:** Size of the word embedding vectors.
+         * **workers:** Number of CPU cores to be used during the training process.
+         * **negative:** Specifies how many "noise words" to sample for each positive example during training. Typical values range from 5 to 20. Higher values make training slower but can improve embedding quality.
+         * **window (int):** Maximum distance between the current and predicted word.
+         * **min_count (int):** Ignores all words with total frequency lower than this.
+
+         For more information check: https://radimrehurek.com/gensim/models/word2vec.html
+     :param return_workflows: If provided, yields results only for these workflows. Defaults to None.
+     :type return_workflows: list[str] or None
+     :param return_all: If True, yields results for all workflows. Defaults to False.
+     :type return_all: bool
+
+     :return: Wrapper for cosine_similarity and nearest_neighbors, or a generator of step/workflow results.
+     :rtype: ResultGenerator | StepGenerator | Wrapper
+     """
+
+     PredictTextWorkflow: StageWorkflowBuilder = TextProcessing(
+         name="ProcessedText",
+         workflows={
+             "tokenizer": [
+                 {"apply_model": {"model": model}},
+                 {"get_sentences": {}},
+                 {"remove_stop_words": {}}
+             ],
+             "text_processor": [
+                 {"tokens_to_text": {}},
+                 {"to_lower": {}},
+                 {"join": {}},
+                 {"save_to_file": {"path": path}}
+             ]
+         }
+     )
+
+     SGNSPredictWorkflow: WorkflowBuilder = WorkflowBuilder(
+         name="SGNS",
+         module_name="predict_models",
+         template=PredictModelsTemplate,
+         workflow=[
+             {"word2vec": {"sg": 1, **kwargs}}
+         ]
+     )
+
+     PredictGeometricDistance: WorkflowBuilder = WorkflowBuilder(
+         name="GeometricDistance",
+         module_name="geometric_distance",
+         template=GeometricDistanceTemplate,
+         workflow=[
+             {"cos_similarity": {}},
+             {"nearest_neighbors": {}}
+         ],
+         wrapper=True
+     )
+
+     EmbeddingsHook: Hook = ModelToDataframe.rename(new_name="Embeddings")
+
+     Model: ModelBuilder = ModelBuilder(
+         workflows=[
+             PredictTextWorkflow,
+             SGNSPredictWorkflow,
+             EmbeddingsHook,
+             PredictGeometricDistance
+         ]
+     )
+
+     return Model(input_data=text, return_workflows=return_workflows, return_all=return_all)
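A hedged end-to-end sketch of calling StandardModel (assumes a spaCy model such as `en_core_web_sm` is installed; the exact interface of the returned Wrapper is defined in `dstk/workflows` and not shown in this diff):

.. code-block:: python

    from dstk.models.models import StandardModel

    with open("corpus.txt", encoding="utf-8") as f:  # hypothetical corpus file
        text = f.read()

    # Final output: a Wrapper around cos_similarity / nearest_neighbors
    distance = StandardModel(text, model="en_core_web_sm", window_size=2, n_components=100)

    # Or inspect each stage instead of the final wrapper
    for step in StandardModel(text, model="en_core_web_sm", return_all=True):
        print(step.name)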
dstk/modules/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .tokenizer import *
+ from .text_processor import *
+ from .ngrams import *
+ from .text_matrix_builder import *
+ from .weight_matrix import *
+ from .count_models import *
+ from .geometric_distance import *
+ from .predict_models import *
+
+ from .data_visualization import *
dstk/modules/count_models.py ADDED
@@ -0,0 +1,91 @@
+ """
+ This module offers functionality to transform and reduce high-dimensional text data represented as matrices, enabling more effective downstream analysis and modeling.
+
+ Key features include:
+
+ * Scaling input matrices to zero mean and unit variance using standardization.
+ * Generating low-dimensional word embeddings from co-occurrence matrices using dimensionality reduction techniques:
+     * Truncated Singular Value Decomposition (SVD)
+     * Principal Component Analysis (PCA)
+
+ These techniques help distill semantic information from sparse and high-dimensional co-occurrence data, facilitating tasks such as clustering, visualization, and feature extraction in natural language processing pipelines.
+
+ All functions return results as Pandas DataFrames for seamless integration with data workflows.
+ """
+
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.decomposition import PCA, TruncatedSVD
+ import pandas as pd
+
+ from ..lib_types import ndarray, DataFrame
+
+ def scale_matrix(matrix: DataFrame, **kwargs) -> DataFrame:
+     """
+     Scales the input matrix to have zero mean and unit variance for each feature.
+
+     This method applies standardization using scikit-learn's StandardScaler, which transforms the data such that each column (feature) has a mean of 0 and a standard deviation of 1.
+
+     :param matrix: The input data to scale.
+     :type matrix: DataFrame
+     :param kwargs: Additional keyword arguments to pass to sklearn's StandardScaler. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
+
+     :returns: A scaled matrix.
+     :rtype: DataFrame
+     """
+
+     scaler: StandardScaler = StandardScaler(**kwargs)
+     scaled_matrix: ndarray = scaler.fit_transform(matrix)
+
+     return pd.DataFrame(scaled_matrix, index=matrix.index, columns=matrix.columns)
+
+ def svd_embeddings(matrix: DataFrame, n_components: int = 100, **kwargs) -> DataFrame:
+     """
+     Generates word embeddings using truncated Singular Value Decomposition (SVD).
+
+     :param matrix: A co-occurrence matrix from which embeddings will be generated.
+     :type matrix: DataFrame
+     :param n_components: The number of dimensions to reduce the word embeddings to. Defaults to 100.
+     :type n_components: int
+     :param kwargs: Additional keyword arguments to pass to sklearn's TruncatedSVD. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
+
+     :returns: A DataFrame of word embeddings generated by SVD.
+     :rtype: DataFrame
+     """
+
+     svd: TruncatedSVD = TruncatedSVD(n_components=n_components, **kwargs)
+     embeddings: ndarray = svd.fit_transform(matrix)
+
+     n_dims: int = embeddings.shape[1]
+     columns: list[str] = [f"dim_{num}" for num in range(n_dims)]
+
+     return pd.DataFrame(embeddings, index=matrix.index, columns=columns)
+
+ def pca_embeddings(matrix: DataFrame, n_components: int | float = 100, **kwargs) -> DataFrame:
+     """
+     Generates word embeddings using Principal Component Analysis (PCA).
+
+     :param matrix: A co-occurrence matrix from which embeddings will be generated.
+     :type matrix: DataFrame
+     :param n_components: If an integer, the number of dimensions to reduce the word embeddings to. If a float between 0 and 1, the proportion of variance to preserve. Defaults to 100.
+     :type n_components: int or float
+     :param kwargs: Additional keyword arguments to pass to sklearn's PCA. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
+
+     :returns: A DataFrame of word embeddings generated by PCA.
+     :rtype: DataFrame
+     """
+
+     pca: PCA = PCA(n_components=n_components, **kwargs)
+     embeddings: ndarray = pca.fit_transform(matrix)
+
+     n_dims: int = embeddings.shape[1]
+     columns: list[str] = [f"dim_{num}" for num in range(n_dims)]
+
+     return pd.DataFrame(embeddings, index=matrix.index, columns=columns)
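A small self-contained sketch of the scale-then-reduce flow on a toy co-occurrence matrix (values are made up; note that TruncatedSVD requires n_components to be strictly smaller than the number of features):

.. code-block:: python

    import pandas as pd

    from dstk.modules.count_models import scale_matrix, svd_embeddings

    words = ["cat", "dog", "car"]
    cooc = pd.DataFrame(
        [[2, 1, 0],
         [1, 3, 0],
         [0, 0, 4]],
        index=words, columns=words,
    )

    scaled = scale_matrix(cooc)                          # zero mean, unit variance per column
    embeddings = svd_embeddings(scaled, n_components=2)  # rows indexed by word
    print(embeddings)                                    # columns dim_0, dim_1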
dstk/modules/data_visualization/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .clustering import *
+ from .embeddings import *
dstk/modules/data_visualization/clustering.py ADDED
@@ -0,0 +1,129 @@
+ """
+ Clustering utilities for word embeddings analysis and visualization.
+
+ This module provides functions to determine the optimal number of clusters for word embeddings using popular methods such as the Elbow method and the Silhouette score. It also assigns cluster labels to the embeddings accordingly.
+
+ Key features:
+
+ * *elbow_method:* Applies the Elbow method to find the best cluster count by locating the knee of the inertia curve.
+ * *extract_silhouette_score:* Uses the Silhouette score to evaluate clustering quality and determine the optimal cluster number.
+ * Both functions support visualization of their respective metrics and can save plots to file.
+ * Cluster labels are appended to the embeddings DataFrame for easy downstream use, such as visualization or further analysis.
+
+ These utilities are designed to work seamlessly with word embedding DataFrames, enabling efficient and interpretable clustering analysis.
+ """
+
+ import plotly.express as px
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import silhouette_score
+ from kneed import KneeLocator
+
+ from ...lib_types import DataFrame, Figure
+
+ def elbow_method(embeddings: DataFrame, max_clusters: int, show: bool = False, path: str | None = None) -> DataFrame:
+     """
+     Applies the Elbow method to determine the optimal number of clusters for word embeddings, and assigns cluster labels based on the identified value.
+
+     :param embeddings: A dataframe containing the word embeddings.
+     :type embeddings: DataFrame
+     :param max_clusters: The maximum number of clusters to evaluate when applying the Elbow method.
+     :type max_clusters: int
+     :param show: If True, shows the plot. Defaults to False.
+     :type show: bool
+     :param path: If provided, saves the plot in the specified path. Defaults to None.
+     :type path: str or None
+
+     :returns: A copy of the input DataFrame with an additional `'cluster'` column containing the cluster labels.
+     :rtype: DataFrame
+     """
+     df: DataFrame = embeddings.copy()
+     cluster_counts: list[int] = []
+     inertias: list[float] = []
+
+     for k in range(1, max_clusters + 1):
+         kmeans: KMeans = KMeans(n_clusters=k, random_state=42)
+         kmeans.fit(embeddings)
+
+         cluster_counts.append(k)
+         inertias.append(kmeans.inertia_)
+
+     elbow: KneeLocator = KneeLocator(cluster_counts, inertias, curve="convex", direction="decreasing")
+
+     elbow_plot: Figure = px.line(
+         x=cluster_counts,
+         y=inertias,
+         markers=True,
+         title="Elbow method",
+         labels={
+             "x": "Number of clusters",
+             "y": "Inertia"
+         }
+     )
+
+     if path:
+         elbow_plot.write_html(path)
+
+     if show:
+         elbow_plot.show()
+
+     print(f"The optimal number of clusters is {elbow.knee} with an inertia of {elbow.knee_y}")
+
+     cluster_kmeans: KMeans = KMeans(n_clusters=elbow.knee, random_state=42)
+     df["cluster"] = cluster_kmeans.fit_predict(df)
+
+     return df
+
+ def extract_silhouette_score(embeddings: DataFrame, max_clusters: int, show: bool = False, path: str | None = None, **kwargs) -> DataFrame:
+     """
+     Uses the Silhouette score to determine the optimal number of clusters for word embeddings, and assigns cluster labels based on the identified value.
+
+     :param embeddings: A dataframe containing the word embeddings.
+     :type embeddings: DataFrame
+     :param max_clusters: The maximum number of clusters to evaluate when computing the Silhouette score.
+     :type max_clusters: int
+     :param show: If True, shows the plot. Defaults to False.
+     :type show: bool
+     :param path: If provided, saves the plot in the specified path. Defaults to None.
+     :type path: str or None
+     :param kwargs: Additional keyword arguments to pass to sklearn.metrics' silhouette_score. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
+
+     :returns: A copy of the input DataFrame with an additional `'cluster'` column containing the cluster labels.
+     :rtype: DataFrame
+     """
+     df: DataFrame = embeddings.copy()
+
+     sil_scores: list[tuple[int, float]] = []
+
+     for k in range(2, max_clusters + 1):
+         kmeans: KMeans = KMeans(n_clusters=k, random_state=42)
+         kmeans.fit(embeddings)
+         sil_score: float = silhouette_score(embeddings, kmeans.labels_, **kwargs)
+         sil_scores.append((k, sil_score))
+
+     highest_score: tuple[int, float] = max(sil_scores, key=lambda tup: tup[1])
+     print(f"The optimal number of clusters is {highest_score[0]} with a Silhouette score of {highest_score[1]}")
+
+     cluster_kmeans: KMeans = KMeans(n_clusters=highest_score[0], random_state=42)
+     df["cluster"] = cluster_kmeans.fit_predict(df)
+
+     clusters, scores = zip(*sil_scores)
+
+     sil_plot: Figure = px.line(
+         x=clusters,
+         y=scores,
+         markers=True,
+         title="Silhouette Score",
+         labels={
+             "x": "Number of Clusters",
+             "y": "Silhouette Score"
+         }
+     )
+
+     if path:
+         sil_plot.write_html(path)
+
+     if show:
+         sil_plot.show()
+
+     return df
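A brief sketch of clustering a toy embeddings DataFrame with the Silhouette-based helper (in practice the input would come from svd_embeddings or pca_embeddings; with four points, max_clusters must stay at or below 3, since silhouette_score needs fewer labels than samples):

.. code-block:: python

    import pandas as pd

    from dstk.modules.data_visualization.clustering import extract_silhouette_score

    embeddings = pd.DataFrame(
        {"dim_0": [0.1, 0.2, 5.0, 5.1], "dim_1": [0.0, 0.1, 4.9, 5.2]},
        index=["cat", "dog", "car", "truck"],
    )

    # Evaluates k = 2..3, prints the best k, and appends a 'cluster' column
    clustered = extract_silhouette_score(embeddings, max_clusters=3)
    print(clustered["cluster"])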
dstk/modules/data_visualization/embeddings.py ADDED
@@ -0,0 +1,101 @@
+ """
+ Visualization utilities for word embeddings using UMAP dimensionality reduction.
+
+ This module provides a function to project high-dimensional word embeddings into 2D or 3D space for visualization purposes. It uses UMAP to reduce dimensionality while preserving local and global structure, enabling intuitive exploration of semantic relationships between words.
+
+ Key features:
+
+ * Supports 2D and 3D scatter plots of word embeddings.
+ * Optionally displays word labels and cluster assignments.
+ * Allows customization of UMAP parameters such as number of neighbors, distance metric, and minimum distance.
+ * Supports saving interactive Plotly visualizations as HTML files.
+
+ This utility helps linguists, NLP practitioners, and data scientists gain insights from embedding spaces through visual inspection.
+ """
+
+ import plotly.express as px
+ from umap import UMAP
+ import pandas as pd
+
+ from ...lib_types import ndarray, DataFrame, Figure
+
+ def plot_embeddings(embeddings: DataFrame, n_dimensions: int = 2, labels: bool = False, show: bool = True, path: str | None = None, umap_neighbors: int = 15, umap_metric: str = "cosine", umap_dist: float = 0.1) -> Figure:
+     """
+     Generates a plot of the word embeddings using UMAP for dimensionality reduction.
+
+     :param embeddings: A dataframe containing the word embeddings.
+     :type embeddings: DataFrame
+     :param n_dimensions: The number of dimensions for the plot. Must be 2 or 3, corresponding to a 2D or 3D scatter plot respectively. This also determines the dimensionality UMAP will reduce the embeddings to. Defaults to 2.
+     :type n_dimensions: int
+     :param labels: Whether to show word labels on each point. Defaults to False.
+     :type labels: bool
+     :param show: If True, shows the plot. Defaults to True.
+     :type show: bool
+     :param path: If provided, saves the plot in the specified path. Defaults to None.
+     :type path: str or None
+     :param umap_neighbors: Controls how UMAP balances local versus global structure. Higher values consider a broader context when reducing dimensions. Defaults to 15.
+     :type umap_neighbors: int
+     :param umap_metric: The distance metric UMAP uses to assess similarity between words (e.g., "cosine", "euclidean"). Defaults to "cosine", which is common for word embeddings.
+     :type umap_metric: str
+     :param umap_dist: Controls how tightly UMAP packs points together. Lower values keep similar words closer in the projected space. Defaults to 0.1.
+     :type umap_dist: float
+
+     :return: A Plotly Figure object containing the 2D or 3D scatter plot.
+     :rtype: Figure
+     """
+
+     if n_dimensions not in (2, 3):
+         raise ValueError("Only 2D or 3D plots are supported (n_dimensions=2 or 3)")
+
+     reducer: UMAP = UMAP(n_components=n_dimensions, n_neighbors=umap_neighbors, min_dist=umap_dist, metric=umap_metric)
+     umap_embeddings: ndarray = reducer.fit_transform(embeddings)
+
+     cols: list[str] = [f"Semantic Axis {i+1}" for i in range(n_dimensions)]
+
+     umap_df = pd.DataFrame(umap_embeddings, index=embeddings.index, columns=cols)
+
+     if "cluster" in embeddings.columns:
+         umap_df["Cluster"] = embeddings["cluster"]
+     else:
+         umap_df["Cluster"] = "None"
+
+     umap_df["Word"] = umap_df.index
+
+     scatter: Figure
+
+     if n_dimensions == 2:
+         scatter = px.scatter(
+             umap_df,
+             x=cols[0],
+             y=cols[1],
+             color="Cluster",
+             text="Word" if labels else None,
+             hover_data=["Word"] + cols + ["Cluster"],
+             title="2D Projection of word embeddings",
+             color_continuous_scale="Spectral"
+         )
+
+         scatter.update_traces(textfont_size=10, textposition="top center")
+     else:
+         scatter = px.scatter_3d(
+             umap_df,
+             x=cols[0],
+             y=cols[1],
+             z=cols[2],
+             color="Cluster",
+             text="Word" if labels else None,
+             hover_data=["Word"] + cols + ["Cluster"],
+             title="3D Projection of word embeddings",
+             color_continuous_scale="Spectral"
+         )
+
+         scatter.update_traces(textfont_size=14, textposition="top center")
+
+     if path:
+         scatter.write_html(path)
+
+     if show:
+         scatter.show()
+
+     return scatter
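A short sketch of projecting the clustered toy embeddings from the previous example into 2D (assumes umap-learn is installed; umap_neighbors is lowered because UMAP's default of 15 exceeds the four-point toy input):

.. code-block:: python

    from dstk.modules.data_visualization.embeddings import plot_embeddings

    # 'clustered' carries a 'cluster' column, so points are colour-coded
    fig = plot_embeddings(
        clustered,
        n_dimensions=2,
        labels=True,
        show=False,
        path="embeddings.html",  # hypothetical output path
        umap_neighbors=2,
    )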