dstklib 1.0.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstk/__init__.py +10 -12
- dstk/adaptors/__init__.py +2 -0
- dstk/adaptors/adaptors.py +91 -0
- dstk/adaptors/typeguards.py +141 -0
- dstk/hooks/__init__.py +2 -0
- dstk/hooks/hook_tools.py +89 -0
- dstk/hooks/type_conversion.py +40 -0
- dstk/lib_types/__init__.py +2 -3
- dstk/lib_types/dstk_types.py +188 -16
- dstk/lib_types/plotly_types.py +1 -0
- dstk/method_index.py +32 -0
- dstk/models/__init__.py +2 -0
- dstk/models/model_tools.py +83 -0
- dstk/models/models.py +191 -0
- dstk/modules/__init__.py +10 -0
- dstk/modules/count_models.py +91 -0
- dstk/modules/data_visualization/__init__.py +2 -0
- dstk/modules/data_visualization/clustering.py +129 -0
- dstk/modules/data_visualization/embeddings.py +101 -0
- dstk/modules/geometric_distance.py +114 -0
- dstk/modules/ngrams.py +156 -0
- dstk/modules/predict_models.py +109 -0
- dstk/modules/text_matrix_builder.py +55 -0
- dstk/modules/text_processor.py +100 -0
- dstk/modules/tokenizer.py +139 -0
- dstk/modules/weight_matrix.py +65 -0
- dstk/templates/__init__.py +2 -0
- dstk/templates/rules.py +59 -0
- dstk/templates/templates.py +231 -0
- dstk/workflows/__init__.py +2 -0
- dstk/workflows/stage_workflows.py +55 -0
- dstk/workflows/workflow_tools.py +383 -0
- dstklib-2.0.0.dist-info/METADATA +377 -0
- dstklib-2.0.0.dist-info/RECORD +43 -0
- dstk/collocations.py +0 -121
- dstk/count_models.py +0 -112
- dstk/geometric_distance.py +0 -107
- dstk/lib_types/matplotlib_types.py +0 -4
- dstk/lib_types/nltk_types.py +0 -1
- dstk/matrix_base.py +0 -113
- dstk/pipeline_tools.py +0 -27
- dstk/pipelines.py +0 -114
- dstk/plot_embeddings.py +0 -240
- dstk/predict_models.py +0 -189
- dstk/text_matrix_builder.py +0 -87
- dstk/text_processor.py +0 -450
- dstk/weight_matrix.py +0 -71
- dstk/workflow_tools.py +0 -257
- dstklib-1.0.2.dist-info/METADATA +0 -369
- dstklib-1.0.2.dist-info/RECORD +0 -28
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
dstk/geometric_distance.py
DELETED
@@ -1,107 +0,0 @@
|
|
1
|
-
from sklearn.neighbors import NearestNeighbors
|
2
|
-
import pandas as pd
|
3
|
-
import numpy as np
|
4
|
-
from sklearn.metrics.pairwise import cosine_similarity
|
5
|
-
from .workflow_tools import requires, workflow, WorkflowManager
|
6
|
-
|
7
|
-
from .lib_types import ndarray, Series, DataFrame
|
8
|
-
|
9
|
-
# Workflow stage names declared by this module, in order.
STAGES = [
    "start",  # The embeddings array
    "end",  # After a distance has been applied
]
|
13
|
-
|
14
|
-
class GeometricDistance(WorkflowManager):
    """
    Provides a set of methods to calculate the distance between the embeddings of words: Euclidean distance, Manhattan distance, cosine similarity and nearest neighbors.
    """

    def __init__(self, embeddings: DataFrame | None = None):
        """
        Initializes GeometricDistance with given attributes.

        :param embeddings: A dataframe of word embeddings whose index holds the words. Defaults to None.
        """

        super().__init__()

        self._set_workflow(input_arg=embeddings)

    # TODO: It would be interesting to select a set of distances and return all of them for comparison, or to pass several words and return an array with the result for each of them.
    @requires(stages=["start", "end"], multiple_calls=True)
    @workflow(input_arg="embeddings", input_process="_start", output_process="_end", next_stage="end")
    def euclidean_distance(self, *, embeddings: DataFrame, first_word: str, second_word: str) -> float:
        """
        Computes the Euclidean distance between the embeddings of two words.

        :param embeddings: A dataframe containing the word embeddings.
        :param first_word: The first word in the pair.
        :param second_word: The second word in the pair.

        :return: The Euclidean distance as a Python float.
        """

        first_word_vector: Series = embeddings.loc[first_word]
        second_word_vector: Series = embeddings.loc[second_word]

        return float(np.linalg.norm(first_word_vector - second_word_vector))

    @requires(stages=["start", "end"], multiple_calls=True)
    @workflow(input_arg="embeddings", input_process="_start", output_process="_end", next_stage="end")
    def manhattan_distance(self, *, embeddings: DataFrame, first_word: str, second_word: str) -> float:
        """
        Computes the Manhattan distance between the embeddings of two words.

        :param embeddings: A dataframe containing the word embeddings.
        :param first_word: The first word in the pair.
        :param second_word: The second word in the pair.

        :return: The Manhattan distance as a Python float.
        """

        first_word_vector: Series = embeddings.loc[first_word]
        second_word_vector: Series = embeddings.loc[second_word]

        # Convert the NumPy scalar so the return type matches the annotation, as euclidean_distance does.
        return float(np.sum(np.abs(first_word_vector - second_word_vector)))

    @requires(stages=["start", "end"], multiple_calls=True)
    @workflow(input_arg="embeddings", input_process="_start", output_process="_end", next_stage="end")
    def cos_similarity(self, *, embeddings: DataFrame, first_word: str, second_word: str) -> float:
        """
        Computes the cosine similarity between the embeddings of two words.

        :param embeddings: A dataframe containing the word embeddings.
        :param first_word: The first word in the pair.
        :param second_word: The second word in the pair.

        :return: The cosine similarity as a Python float.
        """

        # cosine_similarity works on 2D inputs, so each vector becomes a single-row matrix.
        first_word_vector: ndarray = np.array(embeddings.loc[first_word]).reshape(1, -1)
        second_word_vector: ndarray = np.array(embeddings.loc[second_word]).reshape(1, -1)

        cos_sim: ndarray = cosine_similarity(first_word_vector, second_word_vector)

        return float(cos_sim[0][0])

    @requires(stages=["start", "end"], multiple_calls=True)
    @workflow(input_arg="embeddings", input_process="_start", output_process="_end", next_stage="end")
    def nearest_neighbors(self, *, embeddings: DataFrame, word: str, metric: str, n_words: int = 5, **kwargs) -> list[tuple[str, float]]:
        """
        Returns up to N words closest to a given target word, based on the specified distance or similarity metric.

        :param embeddings: A dataframe containing the word embeddings.
        :param word: The target word to find neighbors for.
        :param metric: The distance or similarity metric to use (e.g., 'cosine', 'euclidean').
        :param n_words: Number of nearest neighbors to return. Defaults to 5.
        :param kwargs: Additional keyword arguments to pass to sklearn's NearestNeighbors. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

        :return: A list of (word, score) tuples, where score is 1 minus the distance reported by NearestNeighbors. The target word itself is excluded.
        """

        neighbors: NearestNeighbors = NearestNeighbors(n_neighbors=n_words, algorithm="auto", metric=metric, **kwargs)
        neighbors.fit(embeddings.to_numpy())

        word_vector: Series = embeddings.loc[word]

        # One extra neighbor is requested because the target word is filtered out below; the request is capped at the number of fitted rows.
        n_query: int = min(n_words + 1, len(embeddings))

        distances: ndarray
        indices: ndarray
        distances, indices = neighbors.kneighbors([word_vector], n_neighbors=n_query)

        # NOTE(review): 1 - distance is a similarity only for the cosine metric; for other metrics the score is just a shifted, negated distance - confirm intent.
        results: list[tuple[str, float]] = [
            (embeddings.index[index], float(1 - distance))
            for index, distance in zip(indices[0], distances[0])
            if embeddings.index[index] != word
        ]

        # If the target word was not among its own neighbors, drop the extra entry so at most n_words are returned.
        return results[:n_words]
|
dstk/lib_types/nltk_types.py
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
from nltk import Text
|
dstk/matrix_base.py
DELETED
@@ -1,113 +0,0 @@
|
|
1
|
-
from dataclasses import dataclass, field
|
2
|
-
import pandas as pd
|
3
|
-
from .lib_types import Matrix, Labels, ndarray, DataFrame, csr_matrix, csc_matrix
|
4
|
-
from .workflow_tools import accepts_generic
|
5
|
-
|
6
|
-
from typing import Any, cast, Callable
|
7
|
-
|
8
|
-
@dataclass
class MatrixRepresentation:
    """
    Bundles a matrix with its optional row labels, column labels and free-form metadata.

    :param matrix: The core matrix data.
    :param rows: Labels for the rows. Defaults to None.
    :param columns: Labels for the columns. Defaults to None.
    :param meta: Dictionary for additional metadata. A fresh empty dictionary is created for each instance.
    """

    matrix: Matrix
    rows: Labels = None
    columns: Labels = None
    # default_factory gives every instance its own dictionary instead of a shared one.
    meta: dict[str, Any] = field(default_factory=dict)
|
23
|
-
|
24
|
-
def accept_matrix_representation(accepts: bool = True, custom_error_message: str = "", intercept: bool = True, meta: str | None = None, override: tuple[str, Any] | None = None) -> Callable:
    """
    Decorator that allows a method to accept a MatrixRepresentation object as input.

    Extracts the underlying matrix for processing and repackages the output: labels are reattached to matrix results, and non-matrix results can be captured into metadata.

    :param accepts: Whether to accept MatrixRepresentation inputs (forwarded to accepts_generic).
    :param custom_error_message: Optional error message if input type is not accepted (forwarded to accepts_generic).
    :param intercept: Whether to intercept the input and repackage the output (forwarded to accepts_generic).
    :param meta: Optional metadata key to store non-matrix outputs in the result.
    :param override: Optional (attribute, value) pair set on the result when the wrapped method itself returns a MatrixRepresentation.

    :return: A decorated method that supports MatrixRepresentation as input.
    """

    def is_matrix_representation(matrix: Any) -> bool:
        """
        Returns True if matrix is an instance of MatrixRepresentation, False otherwise.

        :param matrix: The object whose type is checked.
        """

        return isinstance(matrix, MatrixRepresentation)

    def intercept_matrix(self, input_value: MatrixRepresentation, method: Callable, *args, **kwargs) -> MatrixRepresentation:
        """
        Calls method on the raw matrix held by input_value and wraps the result back into a MatrixRepresentation.

        :raises TypeError: If the result is neither a MatrixRepresentation nor an ndarray and no meta key was configured.
        """

        result: MatrixRepresentation | Any = method(self, *args, matrix=input_value.matrix, **kwargs)

        if isinstance(result, MatrixRepresentation):
            # The method already produced a representation: carry the input labels over to it.
            result.rows = input_value.rows
            result.columns = input_value.columns

            if override:
                attr, value = override
                setattr(result, attr, value)

            return result

        if isinstance(result, ndarray):
            return MatrixRepresentation(result, input_value.rows, input_value.columns)

        if meta:
            # Non-matrix output: keep the input matrix and labels, and store the output under the meta key.
            matrix: MatrixRepresentation = MatrixRepresentation(input_value.matrix, input_value.rows, input_value.columns)
            matrix.meta[meta] = result

            return matrix

        # Previously this path failed with an UnboundLocalError; report the actual problem instead.
        raise TypeError(f"Cannot repackage a result of type {type(result).__name__}: expected a MatrixRepresentation or ndarray, or a 'meta' key to store it under.")

    return accepts_generic(
        type_checker=is_matrix_representation,
        input_arg="matrix",
        accepts=accepts,
        intercept=intercept,
        interceptor=intercept_matrix,
        input_type=MatrixRepresentation,
        custom_error_message=custom_error_message
    )
|
79
|
-
|
80
|
-
def matrix_to_dataframe(matrix: MatrixRepresentation, **kwargs) -> pd.DataFrame:
    """
    Converts a MatrixRepresentation to a pandas DataFrame.

    Sparse CSR/CSC matrices are converted to a dense array for the DataFrame; the input object itself is left unmodified.

    :param matrix: A MatrixRepresentation instance.
    :param kwargs: Additional keyword arguments to pass to pandas' DataFrame.

    :return: A pandas DataFrame with corresponding data and labels.
    """

    data = matrix.matrix

    # Work on a local copy so the caller's representation keeps its sparse matrix.
    if isinstance(data, (csr_matrix, csc_matrix)):
        data = data.toarray()

    return pd.DataFrame(
        data,
        index=matrix.rows,
        columns=matrix.columns,
        **kwargs
    )
|
99
|
-
|
100
|
-
def dataframe_to_matrix(dataframe: DataFrame):
    """
    Builds a MatrixRepresentation from a pandas DataFrame.

    :param dataframe: The pandas DataFrame to convert.

    :return: A MatrixRepresentation holding the DataFrame values, with its index as row labels and its columns as column labels.
    """

    values = dataframe.to_numpy()
    row_labels = list(dataframe.index)
    column_labels = list(dataframe.columns)

    return MatrixRepresentation(matrix=values, rows=row_labels, columns=column_labels)
|
dstk/pipeline_tools.py
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
from .workflow_tools import WorkflowBuilder, WorkflowManager
|
2
|
-
|
3
|
-
from typing import Any, Callable
|
4
|
-
|
5
|
-
class PipelineBuilder:
    """
    Automates the execution of a sequence of workflows, feeding the result of each one into the next.

    :param workflows: A list of WorkflowBuilder instances or plain callables to run in order.
    """

    def __init__(self, workflows: list[WorkflowBuilder | Callable]):
        """
        Initializes PipelineBuilder with given attributes.
        """

        self.workflows: list[WorkflowBuilder | Callable] = workflows

    def __call__(self, **kwargs) -> Any:
        """
        Runs the pipeline.

        The first workflow is called with the given keyword arguments; every following workflow is called with the previous result as its only positional argument.

        :param kwargs: Keyword arguments passed to the first workflow.

        :return: The result of the last workflow.
        :raises IndexError: If the pipeline has no workflows.
        """

        # Index and slice rather than pop, so the stored list is untouched and the pipeline can be called again.
        entry_workflow: WorkflowBuilder | Callable = self.workflows[0]
        result: Any = entry_workflow(**kwargs)

        for workflow in self.workflows[1:]:
            result = workflow(result)

        return result
|
dstk/pipelines.py
DELETED
@@ -1,114 +0,0 @@
|
|
1
|
-
from .workflow_tools import WorkflowBuilder
|
2
|
-
from .pipeline_tools import PipelineBuilder
|
3
|
-
from dstk.lib_types import Language, DataFrame
|
4
|
-
|
5
|
-
from .text_processor import TextProcessor
|
6
|
-
from .text_matrix_builder import TextMatrixBuilder
|
7
|
-
from .weight_matrix import WeightMatrix
|
8
|
-
from .count_models import CountModels
|
9
|
-
from .geometric_distance import GeometricDistance
|
10
|
-
from .predict_models import PredictModels
|
11
|
-
|
12
|
-
def StandardModel(text: str, model: str | Language, window_size: int = 2, components: int = 100) -> DataFrame:
    """
    Generates word embeddings with the standard model as defined by (Lenci & Sahlgren 97).

    The text is tokenized, stripped of stop words, lemmatized, lowercased and segmented with a context window. The resulting co-occurrence matrix is weighted with PPMI and reduced with truncated SVD.

    :param text: The text to extract the embeddings from.
    :param model: The spaCy NLP model to tokenize the text.
    :param window_size: The size of the context window to segment the text. Defaults to 2.
    :param components: The number of dimensions of the embeddings. Defaults to 100.
    """

    # Each dictionary maps a method name to its keyword arguments; insertion order is the execution order.
    text_steps = {
        "set_model": {"model": model},
        "get_tokens": {},
        "remove_stop_words": {},
        "get_text": {"lemmatize": True},
        "to_lower": {},
        "corpus_by_context_window": {"window_size": window_size}
    }
    matrix_steps = {
        "create_dtm": {},
        "create_co_occurrence_matrix": {},
        "to_dataframe": {}
    }
    weight_steps = {
        "pmi": {"positive": True}
    }
    count_steps = {
        "scale_matrix": {},
        "svd_embeddings": {"n_components": components},
        "to_dataframe": {}
    }

    text_workflow: WorkflowBuilder = WorkflowBuilder(work_class=TextProcessor, method_representation=text_steps)
    matrix_workflow: WorkflowBuilder = WorkflowBuilder(work_class=TextMatrixBuilder, method_representation=matrix_steps)
    weight_workflow: WorkflowBuilder = WorkflowBuilder(work_class=WeightMatrix, method_representation=weight_steps)
    count_workflow: WorkflowBuilder = WorkflowBuilder(work_class=CountModels, method_representation=count_steps)

    pipeline: PipelineBuilder = PipelineBuilder([text_workflow, matrix_workflow, weight_workflow, count_workflow])

    return pipeline(text=text)
|
67
|
-
|
68
|
-
def SGNSModel(text: str, model: str | Language, path: str, **kwargs) -> PredictModels:
    """
    Generates word embeddings using Skip-Gram with Negative Sampling (SGNS) as defined by (Lenci & Sahlgren 162).

    The text is split into sentences, stripped of stop words, lemmatized, lowercased, joined and saved to a file. The embeddings are then trained with word2vec in skip-gram mode. Returns an instance of PredictModels.

    :param text: The text to extract the embeddings from.
    :param model: The spaCy NLP model to tokenize the text.
    :param path: The path to save the processed sentences.
    :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. Common options include:
        - vector_size: Size of the word embedding vectors.
        - workers: Number of CPU cores to be used during the training process.
        - negative: Specifies how many "noise words" to sample for each positive example during training. Typical values range from 5 to 20. Higher values make training slower but can improve embedding quality.
        - window (int): Maximum distance between the current and predicted word.
        - min_count (int): Ignores all words with total frequency lower than this.

    For more information check: https://radimrehurek.com/gensim/models/word2vec.html
    """

    # Each dictionary maps a method name to its keyword arguments; insertion order is the execution order.
    text_steps = {
        "set_model": {"model": model},
        "get_sentences": {},
        "remove_stop_words": {},
        "get_text": {"lemmatize": True},
        "to_lower": {},
        "join": {},
        "save_to_file": {"path": path}
    }

    # sg=1 selects skip-gram; it comes first, so a caller-supplied "sg" in kwargs replaces it.
    word2vec_options = {"sg": 1, **kwargs}

    text_workflow: WorkflowBuilder = WorkflowBuilder(work_class=TextProcessor, method_representation=text_steps)
    predict_workflow: WorkflowBuilder = WorkflowBuilder(
        work_class=PredictModels,
        method_representation={"word2vec": word2vec_options},
        result=False
    )

    pipeline: PipelineBuilder = PipelineBuilder([text_workflow, predict_workflow])

    return pipeline(text=text)
|
dstk/plot_embeddings.py
DELETED
@@ -1,240 +0,0 @@
|
|
1
|
-
import matplotlib.pyplot as plt
|
2
|
-
from sklearn.cluster import KMeans
|
3
|
-
from sklearn.metrics import silhouette_score
|
4
|
-
from kneed import KneeLocator
|
5
|
-
from umap import UMAP
|
6
|
-
from .workflow_tools import requires, workflow, WorkflowManager
|
7
|
-
from .matrix_base import MatrixRepresentation, accept_matrix_representation
|
8
|
-
|
9
|
-
from .lib_types import ndarray, PathCollection, Axes, Axes3D, Labels
|
10
|
-
|
11
|
-
# Workflow stage names declared by this module, in order.
STAGES = [
    "start",  # The embeddings matrix representation
    "clusters",  # The ideal number of clusters to be plotted
    "end",  # The result of the plot
]
|
16
|
-
|
17
|
-
# Maybe return the inertias and the highest score?
|
18
|
-
class PlotEmbeddings(WorkflowManager):
    """
    Provides a set of methods to visualize word embeddings in both 2D and 3D.

    :param embeddings: A matrix representation of word embeddings.
    """

    def __init__(self, embeddings: MatrixRepresentation | None = None):
        """
        Initializes PlotEmbeddings with given attributes.
        """

        super().__init__()

        # Stages

        # Declaration only: no value is assigned here.
        self._clusters: tuple[int, float]

        self._set_workflow(input_arg=embeddings)

    @requires(stages=["start"])
    @workflow(input_arg="matrix", input_process="_start", output_process="_clusters", next_stage="clusters")
    @accept_matrix_representation(meta="n_clusters")
    def elbow_analysis(self, *, matrix: ndarray, max_k: int, show: bool = False, path: str | None = None) -> int:
        """
        Generates an Elbow plot to help determine the optimal number of clusters for the word embeddings. Returns the best number of clusters.

        :param matrix: An array to be clustered.
        :param max_k: Exclusive upper bound on the number of clusters: cluster counts from 1 to max_k - 1 are evaluated.
        :param show: If True, shows the plot. Defaults to False.
        :param path: If provided, saves the plot in the specified path. Defaults to None.

        :return: The knee located by KneeLocator on the inertia curve.

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray
            - matrix representation: MatrixRepresentation
        """

        # Start from a clean figure so earlier plots do not leak into this one.
        plt.clf()
        plt.close("all")

        means: list[int] = []
        inertias: list[float] = []

        for k in range(1, max_k):
            kmeans: KMeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(matrix)

            means.append(k)
            inertias.append(kmeans.inertia_)

        elbow: KneeLocator = KneeLocator(means, inertias, curve="convex", direction="decreasing")

        print(f"The best cluster is {elbow.knee} with an inertia of {elbow.knee_y}")

        elbow.plot_knee()

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return elbow.knee

    @requires(stages=["start"])
    @workflow(input_arg="matrix", input_process="_start", output_process="_clusters", next_stage="clusters")
    @accept_matrix_representation(meta="n_clusters")
    def extract_silhouette_score(self, *, matrix: ndarray, max_k: int, show: bool = False, path: str | None = None, **kwargs) -> int:
        """
        Extracts and plots the Silhouette score to help determine the optimal number of clusters for the word embeddings. Returns the best number of clusters based on the highest score.

        :param matrix: An array to be clustered.
        :param max_k: Exclusive upper bound on the number of clusters: cluster counts from 2 to max_k - 1 are evaluated, so max_k must be greater than 2.
        :param show: If True, shows the plot. Defaults to False.
        :param path: If provided, saves the plot in the specified path. Defaults to None.
        :param kwargs: Additional keyword arguments to pass to sklearn.metrics silhouette_score. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

        :return: The number of clusters with the highest Silhouette score.
        :raises ValueError: If max_k is not greater than 2, since no cluster count would be evaluated.

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray
            - matrix representation: MatrixRepresentation
        """

        # Start from a clean figure so earlier plots do not leak into this one.
        plt.clf()
        plt.close("all")

        if max_k <= 2:
            raise ValueError("max_k must be greater than 2 to compute Silhouette scores.")

        sil_scores: list[tuple[int, float]] = []

        for k in range(2, max_k):
            kmeans: KMeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(matrix)

            # The extra keyword arguments belong to silhouette_score, not to list.append.
            sil_score: float = silhouette_score(matrix, kmeans.labels_, **kwargs)
            sil_scores.append((k, sil_score))

        highest_score: tuple[int, float] = max(sil_scores, key=lambda tup: tup[1])
        print(f"The best cluster is {highest_score[0]} with a Silhouette score of {highest_score[1]}")

        clusters, scores = zip(*sil_scores)

        plt.plot(clusters, scores, 'o-')

        plt.xlabel("Number of Clusters")
        plt.ylabel("Silhouette Scores")
        plt.title("Silhouette Score Plot")
        plt.grid(True)

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return highest_score[0]

    @requires(stages=["clusters"])
    @workflow(input_arg="embeddings", input_attrs={"embeddings": None, "n_clusters": {"meta": "n_clusters"}}, input_process="_clusters", output_process="_end", next_stage="end")
    def plot_embeddings_2D(self, *, embeddings: MatrixRepresentation, n_clusters: int = 1, alpha: float = 0.8, font_size: int = 8, grid: bool = True, show: bool = True, path: str | None = None, umap_neighbors: int = 15, umap_metric: str = "cosine", umap_dist: float = 0.1) -> PathCollection:
        """
        Generates a 2D plot of the word embeddings using UMAP for dimensionality reduction.

        :param embeddings: A matrix representation of word embeddings. Its rows must be labeled.
        :param n_clusters: The number of clusters to form when grouping the word embeddings. Defaults to 1.
        :param alpha: The transparency level (alpha) of the plotted dots, between 0 (fully transparent) and 1 (fully opaque). Defaults to 0.8.
        :param font_size: Specifies the font size for the labels of the plotted dots. Defaults to 8.
        :param grid: If True, displays a grid on the plot; if False, hides it. Defaults to True.
        :param show: If True, shows the plot. Defaults to True.
        :param path: If provided, saves the plot in the specified path. Defaults to None.
        :param umap_neighbors: Controls how UMAP balances local versus global structure. Higher values consider a broader context when reducing dimensions. Defaults to 15.
        :param umap_metric: The distance metric UMAP uses to assess similarity between words (e.g., "cosine", "euclidean"). Defaults to "cosine", which is common for word embeddings.
        :param umap_dist: Controls how tightly UMAP packs points together. Lower values keep similar words closer in the 2D space. Defaults to 0.1.

        :return: A matplotlib PathCollection object representing the plotted points.
        :raises ValueError: If the rows of embeddings are None.
        """

        # Start from a clean figure so earlier plots do not leak into this one.
        plt.clf()
        plt.close("all")

        embeddings_matrix: ndarray = embeddings.matrix
        labels: Labels = embeddings.rows

        # Fail before the UMAP and KMeans work when the points cannot be annotated.
        if labels is None:
            raise ValueError("Rows of embeddings are None. Make sure your matrix contains labeled rows.")

        reducer: UMAP = UMAP(n_components=2, n_neighbors=umap_neighbors, min_dist=umap_dist, metric=umap_metric)
        umap_embeddings: ndarray = reducer.fit_transform(embeddings_matrix)

        # Clusters are computed on the reduced coordinates, not on the original embeddings.
        kmeans: KMeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters: ndarray = kmeans.fit_predict(umap_embeddings)
        scatter: PathCollection = plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=clusters, cmap='Spectral', alpha=alpha)

        for index, label in enumerate(labels):
            plt.annotate(label, (umap_embeddings[index, 0], umap_embeddings[index, 1]), fontsize=font_size, alpha=0.6)

        plt.xlabel("Axis 1")
        plt.ylabel("Axis 2")
        plt.title("Projection of word embeddings")
        plt.colorbar(scatter, label="Cluster")
        plt.grid(grid)
        plt.tight_layout()

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return scatter

    @requires(stages=["clusters"])
    @workflow(input_arg="embeddings", input_attrs={"embeddings": None, "n_clusters": {"meta": "n_clusters"}}, input_process="_clusters", output_process="_end", next_stage="end")
    def plot_embeddings_3D(self, *, embeddings: MatrixRepresentation, n_clusters: int = 1, alpha: float = 0.8, font_size: int = 8, grid: bool = True, show: bool = True, path: str | None = None, umap_neighbors: int = 15, umap_metric: str = "cosine", umap_dist: float = 0.1) -> PathCollection:
        """
        Generates a 3D plot of the word embeddings using UMAP for dimensionality reduction.

        :param embeddings: A matrix representation of word embeddings. Its rows must be labeled.
        :param n_clusters: The number of clusters to form when grouping the word embeddings. Defaults to 1.
        :param alpha: The transparency level (alpha) of the plotted dots, between 0 (fully transparent) and 1 (fully opaque). Defaults to 0.8.
        :param font_size: Specifies the font size for the labels of the plotted dots. Defaults to 8.
        :param grid: If True, displays a grid on the plot; if False, hides it. Defaults to True.
        :param show: If True, shows the plot. Defaults to True.
        :param path: If provided, saves the plot in the specified path. Defaults to None.
        :param umap_neighbors: Controls how UMAP balances local versus global structure. Higher values consider a broader context when reducing dimensions. Defaults to 15.
        :param umap_metric: The distance metric UMAP uses to assess similarity between words (e.g., "cosine", "euclidean"). Defaults to "cosine", which is common for word embeddings.
        :param umap_dist: Controls how tightly UMAP packs points together. Lower values keep similar words closer in the 3D space. Defaults to 0.1.

        :return: The collection object returned by the 3D scatter call, representing the plotted points.
        :raises ValueError: If the rows of embeddings are None.
        """

        # Start from a clean figure so earlier plots do not leak into this one.
        plt.clf()
        plt.close("all")

        embeddings_matrix: ndarray = embeddings.matrix
        labels: Labels = embeddings.rows

        # Fail before the UMAP and KMeans work when the points cannot be annotated.
        if labels is None:
            raise ValueError("Rows of embeddings are None. Make sure your matrix contains labeled rows.")

        reducer: UMAP = UMAP(n_components=3, n_neighbors=umap_neighbors, min_dist=umap_dist, metric=umap_metric)
        umap_embeddings: ndarray = reducer.fit_transform(embeddings_matrix)

        # Clusters are computed on the reduced coordinates, not on the original embeddings.
        kmeans: KMeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(umap_embeddings)

        ax: Axes3D = plt.axes(projection="3d")

        scatter: PathCollection = ax.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], umap_embeddings[:, 2], c=clusters, cmap="Spectral", alpha=alpha)

        for index, label in enumerate(labels):
            ax.text(umap_embeddings[index, 0], umap_embeddings[index, 1], umap_embeddings[index, 2], label, size=font_size, alpha=0.6)

        ax.set_xlabel("Axis 1")
        ax.set_ylabel("Axis 2")
        ax.set_zlabel("Axis 3")
        ax.set_title("3D projection of word embeddings")

        plt.colorbar(scatter, label="Cluster")
        plt.grid(grid)
        plt.tight_layout()

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return scatter
|