dstklib 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dstk/pipelines.py ADDED
@@ -0,0 +1,114 @@
1
+ from .workflow_tools import WorkflowBuilder
2
+ from .pipeline_tools import PipelineBuilder
3
+ from dstk.lib_types import Language, DataFrame
4
+
5
+ from .text_processor import TextProcessor
6
+ from .text_matrix_builder import TextMatrixBuilder
7
+ from .weight_matrix import WeightMatrix
8
+ from .count_models import CountModels
9
+ from .geometric_distance import GeometricDistance
10
+ from .predict_models import PredictModels
11
+
12
def StandardModel(text: str, model: str | Language, window_size: int = 2, components: int = 100) -> DataFrame:
    """
    Generates word embeddings with the standard count model as defined by (Lenci & Sahlgren 97).

    The pipeline chains four workflows, in this order:

    1. TextProcessor: sets the spaCy model, gets the tokens, removes stop words, gets the text with ``lemmatize=True``, lowers it and segments it with a context window.
    2. TextMatrixBuilder: builds a document-term matrix, then a co-occurrence matrix, and converts it to a dataframe.
    3. WeightMatrix: weights the co-occurrence matrix with PPMI (``pmi`` with ``positive=True``).
    4. CountModels: scales the matrix, reduces it with truncated SVD and converts the result to a dataframe.

    :param text: The text to extract the embeddings from.
    :param model: The spaCy NLP model to tokenize the text.
    :param window_size: The size of the context window to segment the text. Defaults to 2.
    :param components: The number of dimensions of the embeddings. Defaults to 100.

    :return: The result of running the pipeline on ``text``.
    """

    # Each entry pairs a workflow class with the ordered methods (and their
    # keyword arguments) to run on it. Dict insertion order is the call order.
    stages = (
        (
            TextProcessor,
            {
                "set_model": {"model": model},
                "get_tokens": {},
                "remove_stop_words": {},
                "get_text": {"lemmatize": True},
                "to_lower": {},
                "corpus_by_context_window": {"window_size": window_size},
            },
        ),
        (
            TextMatrixBuilder,
            {
                "create_dtm": {},
                "create_co_occurrence_matrix": {},
                "to_dataframe": {},
            },
        ),
        (
            WeightMatrix,
            {
                "pmi": {"positive": True},
            },
        ),
        (
            CountModels,
            {
                "scale_matrix": {},
                "svd_embeddings": {"n_components": components},
                "to_dataframe": {},
            },
        ),
    )

    pipeline: PipelineBuilder = PipelineBuilder([
        WorkflowBuilder(work_class=work_class, method_representation=methods)
        for work_class, methods in stages
    ])

    return pipeline(text=text)
67
+
68
def SGNSModel(text: str, model: str | Language, path: str, **kwargs) -> PredictModels:
    """
    Generates word embeddings using Skip-Gram with Negative Sampling (SGNS) as defined by (Lenci & Sahlgren 162).

    The text is preprocessed by extracting its sentences, removing stop words, getting the text with ``lemmatize=True``, lowering it, joining it and saving it to ``path``. The embeddings are then trained with word2vec in skip-gram mode (``sg=1``). Returns an instance of PredictModels.

    :param text: The text to extract the embeddings from.
    :param model: The spaCy NLP model to tokenize the text.
    :param path: The path to save the processed sentences.
    :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. They are merged after the default ``sg=1``, so an explicit ``sg`` overrides it. Common options include:
        - vector_size: Size of the word embedding vectors.
        - workers: Number of CPU cores to be used during the training process.
        - negative: Specifies how many "noise words" to sample for each positive example during training. Typical values range from 5 to 20. Higher values make training slower but can improve embedding quality.
        - window (int): Maximum distance between the current and predicted word.
        - min_count (int): Ignores all words with total frequency lower than this.

    For more information check: https://radimrehurek.com/gensim/models/word2vec.html
    """

    sentence_workflow: WorkflowBuilder = WorkflowBuilder(
        work_class=TextProcessor,
        method_representation={
            "set_model": {"model": model},
            "get_sentences": {},
            "remove_stop_words": {},
            "get_text": {"lemmatize": True},
            "to_lower": {},
            "join": {},
            "save_to_file": {"path": path},
        },
    )

    # Skip-gram is the default; caller-supplied options take precedence.
    word2vec_options = {"sg": 1} | kwargs

    training_workflow: WorkflowBuilder = WorkflowBuilder(
        work_class=PredictModels,
        method_representation={"word2vec": word2vec_options},
        result=False,
    )

    pipeline: PipelineBuilder = PipelineBuilder([sentence_workflow, training_workflow])

    return pipeline(text=text)
@@ -0,0 +1,240 @@
1
+ import matplotlib.pyplot as plt
2
+ from sklearn.cluster import KMeans
3
+ from sklearn.metrics import silhouette_score
4
+ from kneed import KneeLocator
5
+ from umap import UMAP
6
+ from .workflow_tools import requires, workflow, WorkflowManager
7
+ from .matrix_base import MatrixRepresentation, accept_matrix_representation
8
+
9
+ from .lib_types import ndarray, PathCollection, Axes, Axes3D, Labels
10
+
11
# Stage names referenced by the @requires/@workflow decorators in this module.
STAGES = [
    # The embeddings matrix representation
    "start",
    # The ideal number of clusters to be plotted
    "clusters",
    # The result of the plot
    "end",
]
16
+
17
+ # Maybe return the inertias and the highest score?
18
class PlotEmbeddings(WorkflowManager):
    """
    Provides a set of methods to visualize word embeddings in both 2D and 3D.

    :param embeddings: A matrix representation of word embeddings.
    """

    def __init__(self, embeddings: MatrixRepresentation | None = None):
        """
        Initializes PlotEmbeddings with given attributes.

        :param embeddings: A matrix representation of word embeddings, handed to the workflow as its input. Defaults to None.
        """

        super().__init__()

        # Stages

        self._clusters: tuple[int, float]

        self._set_workflow(input_arg=embeddings)

    @requires(stages=["start"])
    @workflow(input_arg="matrix", input_process="_start", output_process="_clusters", next_stage="clusters")
    @accept_matrix_representation(meta="n_clusters")
    def elbow_analysis(self, *, matrix: ndarray, max_k: int, show: bool = False, path: str | None = None) -> int:
        """
        Generates an Elbow plot to help determine the optimal number of clusters for the word embeddings. Returns the best number of clusters.

        KMeans is fitted for every k in ``range(1, max_k)``, i.e. ``max_k`` itself is not evaluated.

        :param matrix: An array to be clustered.
        :param max_k: The exclusive upper bound of the number of clusters to evaluate when applying the Elbow method.
        :param show: If True, shows the plot. Defaults to False.
        :param path: If provided, saves the plot in the specified path. Defaults to None.

        :return: The knee reported by KneeLocator. NOTE(review): presumably None when no knee is detected; confirm against kneed.

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray
            - matrix representation: MatrixRepresentation
        """
        # Start from a clean figure so earlier plots do not leak into this one.
        plt.clf()
        plt.close("all")

        k_values: list[int] = []
        inertias: list[float] = []

        for k in range(1, max_k):
            kmeans: KMeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(matrix)

            k_values.append(k)
            inertias.append(kmeans.inertia_)

        elbow: KneeLocator = KneeLocator(k_values, inertias, curve="convex", direction="decreasing")

        print(f"The best cluster is {elbow.knee} with an inertia of {elbow.knee_y}")

        elbow.plot_knee()

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return elbow.knee

    @requires(stages=["start"])
    @workflow(input_arg="matrix", input_process="_start", output_process="_clusters", next_stage="clusters")
    @accept_matrix_representation(meta="n_clusters")
    def extract_silhouette_score(self, *, matrix: ndarray, max_k: int, show: bool = False, path: str | None = None, **kwargs) -> int:
        """
        Extracts and plots the Silhouette score to help determine the optimal number of clusters for the word embeddings. Returns the best number of clusters based on the highest score.

        KMeans is fitted for every k in ``range(2, max_k)``, i.e. ``max_k`` itself is not evaluated.

        :param matrix: An array to be clustered.
        :param max_k: The exclusive upper bound of the number of clusters to evaluate when computing the Silhouette score. Must be at least 3.
        :param show: If True, shows the plot. Defaults to False.
        :param path: If provided, saves the plot in the specified path. Defaults to None.
        :param kwargs: Additional keyword arguments to pass to sklearn.metrics silhouette_score. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

        :raises ValueError: If ``max_k`` leaves no cluster count to evaluate.

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray
            - matrix representation: MatrixRepresentation
        """
        # range(2, max_k) is empty for max_k <= 2; fail with a clear message
        # instead of the opaque "max() arg is an empty sequence".
        if max_k <= 2:
            raise ValueError(f"max_k must be at least 3 to evaluate any cluster count, got {max_k}")

        plt.clf()
        plt.close("all")

        sil_scores: list[tuple[int, float]] = []

        for k in range(2, max_k):
            kmeans: KMeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(matrix)
            # kwargs belong to silhouette_score, not to list.append.
            sil_score: float = silhouette_score(matrix, kmeans.labels_, **kwargs)
            sil_scores.append((k, sil_score))

        highest_score: tuple[int, float] = max(sil_scores, key=lambda tup: tup[1])
        print(f"The best cluster is {highest_score[0]} with a Silhouette score of {highest_score[1]}")

        clusters, scores = zip(*sil_scores)

        plt.plot(clusters, scores, 'o-')

        plt.xlabel("Number of Clusters")
        plt.ylabel("Silhouette Scores")
        plt.title("Silhouette Score Plot")
        plt.grid(True)

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return highest_score[0]

    @requires(stages=["clusters"])
    @workflow(input_arg="embeddings", input_attrs={"embeddings": None, "n_clusters": {"meta": "n_clusters"}}, input_process="_clusters", output_process="_end", next_stage="end")
    def plot_embeddings_2D(self, *, embeddings: MatrixRepresentation, n_clusters: int = 1, alpha: float = 0.8, font_size: int = 8, grid: bool = True, show: bool = True, path: str | None = None, umap_neighbors: int = 15, umap_metric: str = "cosine", umap_dist: float = 0.1) -> PathCollection:
        """
        Generates a 2D plot of the word embeddings using UMAP for dimensionality reduction.

        The points are colored by KMeans clusters computed on the UMAP-reduced coordinates, not on the original embeddings.

        :param embeddings: A matrix representation of word embeddings.
        :param n_clusters: The number of clusters to form when grouping the word embeddings. Defaults to 1.
        :param alpha: The transparency level (alpha) of the plotted dots, between 0 (fully transparent) and 1 (fully opaque). Defaults to 0.8.
        :param font_size: Specifies the font size for the labels of the plotted dots. Defaults to 8.
        :param grid: If True, displays a grid on the plot; if False, hides it. Defaults to True.
        :param show: If True, shows the plot. Defaults to True.
        :param path: If provided, saves the plot in the specified path. Defaults to None.
        :param umap_neighbors: Controls how UMAP balances local versus global structure. Higher values consider a broader context when reducing dimensions. Defaults to 15.
        :param umap_metric: The distance metric UMAP uses to assess similarity between words (e.g., "cosine", "euclidean"). Defaults to "cosine", which is common for word embeddings.
        :param umap_dist: Controls how tightly UMAP packs points together. Lower values keep similar words closer in the 2D space. Defaults to 0.1.

        :return: A matplotlib PathCollection object representing the plotted points.
        :raises ValueError: If the rows (labels) of ``embeddings`` are None.
        """

        plt.clf()
        plt.close("all")

        embeddings_matrix: ndarray = embeddings.matrix
        labels: Labels = embeddings.rows

        # Validate the labels before the UMAP/KMeans fit so a missing-label
        # error is raised without doing the expensive work first.
        if labels is None:
            raise ValueError("Rows of embeddings are None. Make sure your matrix contains labeled rows.")

        reducer: UMAP = UMAP(n_components=2, n_neighbors=umap_neighbors, min_dist=umap_dist, metric=umap_metric)
        umap_embeddings: ndarray = reducer.fit_transform(embeddings_matrix)

        kmeans: KMeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters: ndarray = kmeans.fit_predict(umap_embeddings)
        scatter: PathCollection = plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=clusters, cmap='Spectral', alpha=alpha)

        for index, label in enumerate(labels):
            plt.annotate(label, (umap_embeddings[index, 0], umap_embeddings[index, 1]), fontsize=font_size, alpha=0.6)

        plt.xlabel("Axis 1")
        plt.ylabel("Axis 2")
        plt.title("Projection of word embeddings")
        plt.colorbar(scatter, label="Cluster")
        plt.grid(grid)
        plt.tight_layout()

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return scatter

    @requires(stages=["clusters"])
    @workflow(input_arg="embeddings", input_attrs={"embeddings": None, "n_clusters": {"meta": "n_clusters"}}, input_process="_clusters", output_process="_end", next_stage="end")
    def plot_embeddings_3D(self, *, embeddings: MatrixRepresentation, n_clusters: int = 1, alpha: float = 0.8, font_size: int = 8, grid: bool = True, show: bool = True, path: str | None = None, umap_neighbors: int = 15, umap_metric: str = "cosine", umap_dist: float = 0.1) -> PathCollection:
        """
        Generates a 3D plot of the word embeddings using UMAP for dimensionality reduction.

        The points are colored by KMeans clusters computed on the UMAP-reduced coordinates, not on the original embeddings.

        :param embeddings: A matrix representation of word embeddings.
        :param n_clusters: The number of clusters to form when grouping the word embeddings. Defaults to 1.
        :param alpha: The transparency level (alpha) of the plotted dots, between 0 (fully transparent) and 1 (fully opaque). Defaults to 0.8.
        :param font_size: Specifies the font size for the labels of the plotted dots. Defaults to 8.
        :param grid: If True, displays a grid on the plot; if False, hides it. Defaults to True.
        :param show: If True, shows the plot. Defaults to True.
        :param path: If provided, saves the plot in the specified path. Defaults to None.
        :param umap_neighbors: Controls how UMAP balances local versus global structure. Higher values consider a broader context when reducing dimensions. Defaults to 15.
        :param umap_metric: The distance metric UMAP uses to assess similarity between words (e.g., "cosine", "euclidean"). Defaults to "cosine", which is common for word embeddings.
        :param umap_dist: Controls how tightly UMAP packs points together. Lower values keep similar words closer in the 3D space. Defaults to 0.1.

        :return: A matplotlib PathCollection object representing the plotted points.
        :raises ValueError: If the rows (labels) of ``embeddings`` are None.
        """
        plt.clf()
        plt.close("all")

        embeddings_matrix: ndarray = embeddings.matrix
        labels: Labels = embeddings.rows

        # Validate the labels before the UMAP/KMeans fit so a missing-label
        # error is raised without doing the expensive work first.
        if labels is None:
            raise ValueError("Rows of embeddings are None. Make sure your matrix contains labeled rows.")

        reducer: UMAP = UMAP(n_components=3, n_neighbors=umap_neighbors, min_dist=umap_dist, metric=umap_metric)
        umap_embeddings: ndarray = reducer.fit_transform(embeddings_matrix)

        kmeans: KMeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(umap_embeddings)

        ax: Axes3D = plt.axes(projection="3d")

        scatter: PathCollection = ax.scatter(
            umap_embeddings[:, 0],
            umap_embeddings[:, 1],
            umap_embeddings[:, 2],
            c=clusters,
            cmap="Spectral",
            alpha=alpha,
        )

        for index, label in enumerate(labels):
            ax.text(umap_embeddings[index, 0], umap_embeddings[index, 1], umap_embeddings[index, 2], label, size=font_size, alpha=0.6)

        ax.set_xlabel("Axis 1")
        ax.set_ylabel("Axis 2")
        ax.set_zlabel("Axis 3")
        ax.set_title("3D projection of word embeddings")

        plt.colorbar(scatter, label="Cluster")
        plt.grid(grid)
        plt.tight_layout()

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return scatter
dstk/predict_models.py ADDED
@@ -0,0 +1,189 @@
1
+ from gensim.models import Word2Vec
2
+ import fasttext
3
+ import numpy as np
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from pathlib import Path
6
+ from .workflow_tools import requires, workflow, WorkflowManager
7
+ from .matrix_base import MatrixRepresentation
8
+
9
+ from .lib_types import ndarray, FastText
10
+
11
+ # The workflow needs to be completely restructured
12
+
13
# Stage names referenced by the @requires/@workflow decorators in this module.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated ("embeddings_operations" "end" -> one string).
STAGES = [
    "start",  # The path to the sentences file
    "predict_model",  # After a model has been generated
    "embeddings_operations",  # After a metric distance has been applied
    "end",  # After the model was saved or the model was turned into matrix representation
]
19
+
20
class PredictModels(WorkflowManager):
    """
    Provides a unified interface to work seamlessly with Gensim's Word2Vec and Facebook's FastText models.

    This class simplifies the process of training, loading, and using word embeddings by integrating both popular algorithms under a single API.

    :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
    """

    _start: str
    _end: str | MatrixRepresentation

    def __init__(self, path: str | None = None):
        """
        Initializes PredictModels with given attributes.

        :param path: The path handed to the workflow as its input. Defaults to None.
        """

        super().__init__()

        # Stages

        self._predict_model: Word2Vec | FastText
        self._embeddings_operations: list[tuple[str, float]] | float

        self._set_workflow(input_arg=path)

    @requires(stages=["start"])
    @workflow(input_arg="path", input_process="_start", output_process="_predict_model", next_stage="predict_model")
    def word2vec(self, *, path: str, **kwargs) -> Word2Vec:
        """
        Creates word embeddings using the Word2Vec algorithm.

        :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
        :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. Common options include:
            - vector_size: Size of the word embedding vectors.
            - workers: Number of CPU cores to be used during the training process.
            - sg: Training algorithm. 1 for skip-gram; 0 for CBOW (Continuous Bag of Words).
            - window (int): Maximum distance between the current and predicted word.
            - min_count (int): Ignores all words with total frequency lower than this.

        For more information check: https://radimrehurek.com/gensim/models/word2vec.html
        """

        return Word2Vec(
            corpus_file=path,
            **kwargs
        )

    @requires(stages=["start"])
    @workflow(input_arg="path", input_process="_start", output_process="_predict_model", next_stage="predict_model")
    def fastText(self, *, path: str, **kwargs) -> FastText:
        """
        Creates word embeddings using the FastText algorithm.

        :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
        :param kwargs: Additional keyword arguments to pass to fasttext.train_unsupervised.
            Common options include:
            - dim: Size of the word embedding vectors.
            - model: Training algorithm: skipgram or cbow (Continuous Bag of Words)
            - thread: Number of CPU cores to be used during the training process.

        For more information check: https://fasttext.cc/docs/en/options.html
        """

        return fasttext.train_unsupervised(
            path,
            **kwargs
        )

    @requires(stages=["start"])
    @workflow(input_arg="path", input_process="_start", output_process="_predict_model", next_stage="predict_model")
    def load_model(self, *, path: str) -> Word2Vec | FastText:
        """
        Loads the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

        The loader is chosen from the file extension, compared case-insensitively.

        :param path: Path to the saved model file.

        :raises ValueError: If the extension is neither .model nor .bin.
        """

        extension: str = Path(path).suffix.lower()

        if extension == ".model":
            return Word2Vec.load(path)
        elif extension == ".bin":
            return fasttext.load_model(path)
        else:
            raise ValueError(f"Model extension {extension} not recognized.")

    @requires(stages=["predict_model", "embeddings_operations"])
    @workflow(input_arg="model", input_process="_predict_model", output_process="_end", next_stage="end")
    def save_model(self, *, model: Word2Vec | FastText, path: str) -> str:
        """
        Saves the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

        :param model: A trained Word2Vec or FastText model.
        :param path: The path (without extension) where to save the model. Any existing suffix is replaced by ``with_suffix``.

        :return: The resolved path of the file that was written, including its .model or .bin extension.
        :raises NotImplementedError: If the model is neither a Word2Vec nor a FastText model.
        """
        base_path: Path = Path(path)
        saved_path: Path

        if isinstance(model, Word2Vec):
            saved_path = base_path.with_suffix(".model")
            model.save(str(saved_path))
        elif isinstance(model, FastText):
            saved_path = base_path.with_suffix(".bin")
            model.save_model(str(saved_path))
        else:
            # type(model).__name__: instances have no __name__ attribute.
            raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

        # Return the file actually written so the result can be fed back to
        # load_model, which dispatches on the extension.
        return str(saved_path.resolve())

    @requires(stages=["predict_model", "embeddings_operations"], multiple_calls=True)
    @workflow(input_arg="model", input_process="_predict_model", output_process="_embeddings_operations", next_stage="embeddings_operations")
    def nearest_neighbors(self, *, model: Word2Vec | FastText, word: str, n_neighbors: int) -> list[tuple[str, float]]:
        """
        Returns the top N most semantically similar words to a given target word.

        :param model: A trained Word2Vec or FastText model.
        :param word: The target word to find neighbors for.
        :param n_neighbors: Number of nearest neighbors to return.

        :return: A list of (word, score) tuples for both model types.
        :raises NotImplementedError: If the model is neither a Word2Vec nor a FastText model.
        """

        if isinstance(model, Word2Vec):
            return model.wv.most_similar(word, topn=n_neighbors)
        elif isinstance(model, FastText):
            result: list[tuple[float, str]] = model.get_nearest_neighbors(word, k=n_neighbors)
            # FastText yields (score, word); flip to match Word2Vec's (word, score).
            return [(neighbor, score) for score, neighbor in result]
        else:
            raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

    @requires(stages=["predict_model", "embeddings_operations"], multiple_calls=True)
    @workflow(input_arg="model", input_process="_predict_model", output_process="_embeddings_operations", next_stage="embeddings_operations")
    def cos_similarity(self, *, model: Word2Vec | FastText, first_word: str, second_word: str) -> float:
        """
        Computes the cosine similarity between the embeddings of two words.

        :param model: A trained Word2Vec or FastText model.
        :param first_word: The first word in the pair.
        :param second_word: The second word in the pair.

        :raises NotImplementedError: If the model is neither a Word2Vec nor a FastText model.
        """

        if isinstance(model, Word2Vec):
            return float(model.wv.similarity(first_word, second_word))
        elif isinstance(model, FastText):
            # cosine_similarity expects 2D inputs, hence the (1, -1) reshape.
            first_word_vector: ndarray = np.array(model[first_word]).reshape(1, -1)
            second_word_vector: ndarray = np.array(model[second_word]).reshape(1, -1)

            cos_sim: ndarray = cosine_similarity(first_word_vector, second_word_vector)

            return float(cos_sim[0][0])
        else:
            raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

    @requires(stages=["predict_model"])
    @workflow(input_arg="model", input_process="_predict_model", output_process="_end", next_stage="end")
    def to_matrix(self, *, model: Word2Vec | FastText) -> MatrixRepresentation:
        """
        Returns a matrix representation of the word embeddings and their associated labels.

        :param model: A trained Word2Vec or FastText model.

        :raises NotImplementedError: If the model is neither a Word2Vec nor a FastText model.
        """

        word_vectors: ndarray
        labels: list[str]

        if isinstance(model, Word2Vec):
            word_vectors = model.wv[model.wv.index_to_key]
            labels = list(model.wv.index_to_key)
        elif isinstance(model, FastText):
            words: list[str] = model.words
            word_vectors = np.array([model[word] for word in words])
            labels = words
        else:
            # Without this branch an unsupported model surfaced as an
            # UnboundLocalError on word_vectors.
            raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

        return MatrixRepresentation(matrix=word_vectors, rows=labels)
@@ -0,0 +1,87 @@
1
+ import pandas as pd
2
+ from sklearn.feature_extraction.text import CountVectorizer
3
+ import numpy as np
4
+ from .workflow_tools import requires, workflow, WorkflowManager
5
+ from .matrix_base import MatrixRepresentation, matrix_to_dataframe
6
+
7
+ from .lib_types import csr_matrix, ndarray, DataFrame, csc_matrix, Matrix, Labels
8
+
9
# Stage names referenced by the @requires/@workflow decorators in this module.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated ("matrix_operations" "end" -> one string).
STAGES = [
    "start",  # Before any processing
    "matrix_operations",  # Matrix operations and transformations
    "end",  # Dataframe operations
]
14
+
15
class TextMatrixBuilder(WorkflowManager):
    """
    Creates a Document Term Matrix, a Co-occurrence Matrix, and dataframes from them.

    :param corpus: A list of sentences or collocations from which to build a matrix.
    """

    _start: list[str]
    _end: DataFrame

    def __init__(self, corpus: list[str] | None = None):
        """
        Initializes TextMatrixBuilder with given attributes.

        :param corpus: The corpus handed to the workflow as its input. Defaults to None.
        """

        super().__init__()

        self._matrix_operations: MatrixRepresentation

        self._set_workflow(input_arg=corpus)

    @requires(stages=["start"])
    @workflow(input_arg="corpus", input_process="_start", output_process="_matrix_operations", next_stage="matrix_operations")
    def create_dtm(self, *, corpus: list[str], **kwargs) -> MatrixRepresentation:
        """
        Creates Document Term Matrix (DTM).

        The rows of the result are labeled with the corpus entries and the columns with the vectorizer's feature names.

        :param corpus: A list of sentences or collocations from which to build a matrix.
        :param kwargs: Additional keyword arguments to pass to sklearn's CountVectorizer.
            Common options include:
            - stop_words: If provided, a list of stopwords to remove from the corpus.
            - ngram_range: A tuple (min_n, max_n) specifying the range of n-grams to consider.
            For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
        """

        vectorizer: CountVectorizer = CountVectorizer(**kwargs)

        dtm: csr_matrix = vectorizer.fit_transform(corpus)

        return MatrixRepresentation(
            matrix=dtm,
            rows=np.array(corpus),
            columns=vectorizer.get_feature_names_out()
        )

    @requires(stages=["matrix_operations"])
    @workflow(input_arg="matrix", input_attrs={"matrix": "matrix", "rows": "columns", "columns": "columns"}, input_process="_matrix_operations", output_process="_matrix_operations")
    def create_co_occurrence_matrix(self, *, matrix: csr_matrix, rows: Labels = None, columns: Labels = None) -> MatrixRepresentation:
        """
        Creates a Co-occurrence matrix.

        The result is ``matrix.T @ matrix``, which is term-by-term. The workflow decorator therefore maps both ``rows`` and ``columns`` to the column labels of the DTM.

        :param matrix: A Document Term Matrix (DTM) from which to build a Co-occurrence matrix.
        :param rows: The labels for the rows of the co-occurrence matrix. Defaults to None.
        :param columns: The labels for the columns of the co-occurrence matrix. Defaults to None.

        :return: A matrix representation wrapping the co-occurrence matrix and its labels.
        """

        co_matrix = matrix.T @ matrix

        return MatrixRepresentation(
            matrix=co_matrix,
            rows=rows,
            columns=columns
        )

    @requires(stages=["matrix_operations"])
    @workflow(input_arg="matrix", input_process="_matrix_operations", output_process="_end", next_stage="end")
    def to_dataframe(self, *, matrix: MatrixRepresentation, **kwargs) -> DataFrame:
        """
        Creates a dataframe from a matrix representation.

        :param matrix: A matrix representation from which to create a dataframe.
        :param kwargs: Additional keyword arguments forwarded to matrix_to_dataframe (presumably passed on to pandas' DataFrame; confirm in matrix_base).
        """

        return matrix_to_dataframe(matrix=matrix, **kwargs)