dstklib 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstk/__init__.py +10 -12
- dstk/adaptors/__init__.py +2 -0
- dstk/adaptors/adaptors.py +91 -0
- dstk/adaptors/typeguards.py +141 -0
- dstk/hooks/__init__.py +2 -0
- dstk/hooks/hook_tools.py +89 -0
- dstk/hooks/type_conversion.py +40 -0
- dstk/lib_types/__init__.py +2 -3
- dstk/lib_types/dstk_types.py +188 -16
- dstk/lib_types/plotly_types.py +1 -0
- dstk/method_index.py +32 -0
- dstk/models/__init__.py +2 -0
- dstk/models/model_tools.py +83 -0
- dstk/models/models.py +191 -0
- dstk/modules/__init__.py +10 -0
- dstk/modules/count_models.py +91 -0
- dstk/modules/data_visualization/__init__.py +2 -0
- dstk/modules/data_visualization/clustering.py +129 -0
- dstk/modules/data_visualization/embeddings.py +101 -0
- dstk/modules/geometric_distance.py +114 -0
- dstk/modules/ngrams.py +156 -0
- dstk/modules/predict_models.py +109 -0
- dstk/modules/text_matrix_builder.py +55 -0
- dstk/modules/text_processor.py +100 -0
- dstk/modules/tokenizer.py +139 -0
- dstk/modules/weight_matrix.py +65 -0
- dstk/templates/__init__.py +2 -0
- dstk/templates/rules.py +59 -0
- dstk/templates/templates.py +231 -0
- dstk/workflows/__init__.py +2 -0
- dstk/workflows/stage_workflows.py +55 -0
- dstk/workflows/workflow_tools.py +383 -0
- dstklib-2.0.0.dist-info/METADATA +377 -0
- dstklib-2.0.0.dist-info/RECORD +43 -0
- dstk/collocations.py +0 -121
- dstk/count_models.py +0 -112
- dstk/geometric_distance.py +0 -107
- dstk/lib_types/matplotlib_types.py +0 -4
- dstk/lib_types/nltk_types.py +0 -1
- dstk/matrix_base.py +0 -113
- dstk/pipeline_tools.py +0 -27
- dstk/pipelines.py +0 -114
- dstk/plot_embeddings.py +0 -240
- dstk/predict_models.py +0 -189
- dstk/text_matrix_builder.py +0 -87
- dstk/text_processor.py +0 -450
- dstk/weight_matrix.py +0 -71
- dstk/workflow_tools.py +0 -257
- dstklib-1.0.1.dist-info/METADATA +0 -360
- dstklib-1.0.1.dist-info/RECORD +0 -28
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
dstk/predict_models.py
DELETED
@@ -1,189 +0,0 @@
|
|
1
|
-
from gensim.models import Word2Vec
|
2
|
-
import fasttext
|
3
|
-
import numpy as np
|
4
|
-
from sklearn.metrics.pairwise import cosine_similarity
|
5
|
-
from pathlib import Path
|
6
|
-
from .workflow_tools import requires, workflow, WorkflowManager
|
7
|
-
from .matrix_base import MatrixRepresentation
|
8
|
-
|
9
|
-
from .lib_types import ndarray, FastText
|
10
|
-
|
11
|
-
# The workflow needs to be completely restructured
|
12
|
-
|
13
|
-
# Ordered names of the workflow stages used by PredictModels.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which previously fused the last two
# stages into the single bogus name "embeddings_operationsend".
STAGES = [
    "start",  # The path to the sentences file
    "predict_model",  # After a model has been generated
    "embeddings_operations",  # After a metric distance has been applied
    "end",  # After the model was saved or the model was turned into matrix representation
]
|
19
|
-
|
20
|
-
class PredictModels(WorkflowManager):
    """
    Provides a unified interface to work seamlessly with Gensim's Word2Vec and Facebook's FastText models.

    This class simplifies the process of training, loading, and using word embeddings by integrating both popular algorithms under a single API.

    Every public method is wrapped by the ``requires`` and ``workflow`` decorators imported from ``workflow_tools``.
    NOTE(review): those decorators are not visible here; presumably ``requires`` restricts the stages in which a method may run and ``workflow`` feeds the stored stage attribute into the ``input_arg`` parameter and stores the result -- confirm against ``workflow_tools``.

    :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
    """

    _start: str
    _end: str | MatrixRepresentation

    def __init__(self, path: str | None = None):
        """
        Initializes PredictModels with given attributes.

        :param path: Optional path to the corpus file; it is handed to ``_set_workflow`` as the workflow input.
        """

        super().__init__()

        # Stages (declared for type checkers only; no value is assigned here)

        self._predict_model: Word2Vec | FastText
        self._embeddings_operations: list[tuple[str,float]] | float

        self._set_workflow(input_arg=path)

    @requires(stages=["start"])
    @workflow(input_arg="path", input_process="_start", output_process="_predict_model", next_stage="predict_model")
    def word2vec(self, *, path: str, **kwargs) -> Word2Vec:
        """
        Creates word embeddings using the Word2Vec algorithm.

        :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
        :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. Common options include:
            - vector_size: Size of the word embedding vectors.
            - workers: Number of CPU cores to be used during the training process.
            - sg: Training algorithm. 1 for skip-gram; 0 for CBOW (Continuous Bag of Words).
            - window (int): Maximum distance between the current and predicted word.
            - min_count (int): Ignores all words with total frequency lower than this.

        For more information check: https://radimrehurek.com/gensim/models/word2vec.html

        :return: The Word2Vec model built from ``path``.
        """

        return Word2Vec(
            corpus_file=path,
            **kwargs
        )

    @requires(stages=["start"])
    @workflow(input_arg="path", input_process="_start", output_process="_predict_model", next_stage="predict_model")
    def fastText(self, *, path: str, **kwargs) -> FastText:
        """
        Creates word embeddings using the FastText algorithm.

        :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
        :param kwargs: Additional keyword arguments to pass to fasttext.train_unsupervised.
            Common options include:
            - dim: Size of the word embedding vectors.
            - model: Training algorithm: skipgram or cbow (Continuous Bag of Words)
            - thread: Number of CPU cores to be used during the training process.

        For more information check: https://fasttext.cc/docs/en/options.html

        :return: The model returned by ``fasttext.train_unsupervised``.
        """

        return fasttext.train_unsupervised(
            path,
            **kwargs
        )

    @requires(stages=["start"])
    @workflow(input_arg="path", input_process="_start", output_process="_predict_model", next_stage="predict_model")
    def load_model(self, *, path: str) -> Word2Vec | FastText:
        """
        Loads the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

        :param path: Path to the saved model file.
        :return: The loaded Word2Vec or FastText model.
        :raises ValueError: If the file extension is neither ``.model`` nor ``.bin``.
        """

        # The loader is chosen purely from the (case-insensitive) file extension.
        extension: str = Path(path).suffix.lower()

        if extension == ".model":
            return Word2Vec.load(path)
        elif extension == ".bin":
            return fasttext.load_model(path)
        else:
            raise ValueError(f"Model extension {extension} not recognized.")

    @requires(stages=["predict_model", "embeddings_operations"])
    @workflow(input_arg="model", input_process="_predict_model", output_process="_end", next_stage="end")
    def save_model(self, *, model: Word2Vec | FastText, path: str) -> str:
        """
        Saves the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

        :param model: A trained Word2Vec or FastText model.
        :param path: The path (without extension) where to save the model.
        :return: The resolved form of ``path`` as given; the suffix substituted while saving is not reflected in it.
        :raises NotImplementedError: If ``model`` is neither a Word2Vec nor a FastText model.
        """
        full_path: Path = Path(path)

        if isinstance(model, Word2Vec):
            model.save(str(full_path.with_suffix(".model")))
        elif isinstance(model, FastText):
            model.save_model(str(full_path.with_suffix(".bin")))
        else:
            # ``__name__`` lives on the class, not on the instance.
            raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

        return str(full_path.resolve())

    @requires(stages=["predict_model", "embeddings_operations"], multiple_calls=True)
    @workflow(input_arg="model", input_process="_predict_model", output_process="_embeddings_operations", next_stage="embeddings_operations")
    def nearest_neighbors(self, *, model: Word2Vec | FastText, word: str, n_neighbors: int) -> list[tuple[str,float]]:
        """
        Returns the top N most semantically similar words to a given target word.

        :param model: A trained Word2Vec or FastText model.
        :param word: The target word to find neighbors for.
        :param n_neighbors: Number of nearest neighbors to return.
        :return: A list of ``(neighbor, score)`` pairs.
        :raises NotImplementedError: If ``model`` is neither a Word2Vec nor a FastText model.
        """

        if isinstance(model, Word2Vec):
            return model.wv.most_similar(word, topn=n_neighbors)
        elif isinstance(model, FastText):
            result: list[tuple[float, str]] = model.get_nearest_neighbors(word, k=n_neighbors)
            # Swap each pair so both backends yield (word, score).
            return [(neighbor, score) for score, neighbor in result]
        else:
            # ``__name__`` lives on the class, not on the instance.
            raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

    @requires(stages=["predict_model", "embeddings_operations"], multiple_calls=True)
    @workflow(input_arg="model", input_process="_predict_model", output_process="_embeddings_operations", next_stage="embeddings_operations")
    def cos_similarity(self, *, model: Word2Vec | FastText, first_word: str, second_word: str) -> float:
        """
        Computes the cosine similarity between the embeddings of two words.

        :param model: A trained Word2Vec or FastText model.
        :param first_word: The first word in the pair.
        :param second_word: The second word in the pair.
        :return: The cosine similarity as a Python float.
        :raises NotImplementedError: If ``model`` is neither a Word2Vec nor a FastText model.
        """

        if isinstance(model, Word2Vec):
            return float(model.wv.similarity(first_word, second_word))
        elif isinstance(model, FastText):
            # cosine_similarity expects 2-D inputs, hence the (1, -1) reshape.
            first_word_vector: ndarray = np.array(model[first_word]).reshape(1, -1)
            second_word_vector: ndarray = np.array(model[second_word]).reshape(1, -1)

            cos_sim: ndarray = cosine_similarity(first_word_vector, second_word_vector)

            return float(cos_sim[0][0])
        else:
            # ``__name__`` lives on the class, not on the instance.
            raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

    @requires(stages=["predict_model"])
    @workflow(input_arg="model", input_process="_predict_model", output_process="_end", next_stage="end")
    def to_matrix(self, *, model: Word2Vec | FastText) -> MatrixRepresentation:
        """
        Returns a matrix representation of the word embeddings and their associated labels.

        :param model: A trained Word2Vec or FastText model.
        :return: A MatrixRepresentation whose matrix holds one vector per word and whose rows are the words.
        :raises NotImplementedError: If ``model`` is neither a Word2Vec nor a FastText model.
        """

        word_vectors: ndarray
        labels: list[str]

        if isinstance(model, Word2Vec):
            word_vectors = model.wv[model.wv.index_to_key]
            labels = list(model.wv.index_to_key)
        elif isinstance(model, FastText):
            words: list[str] = model.words
            word_vectors = np.array([model[word] for word in words])
            labels = words
        else:
            # Without this branch an unsupported model surfaced as an
            # UnboundLocalError on the return statement below.
            raise NotImplementedError(f"Model identifier type {type(model).__name__} not yet supported")

        return MatrixRepresentation(matrix=word_vectors, rows=labels)
|
dstk/text_matrix_builder.py
DELETED
@@ -1,87 +0,0 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
from sklearn.feature_extraction.text import CountVectorizer
|
3
|
-
import numpy as np
|
4
|
-
from .workflow_tools import requires, workflow, WorkflowManager
|
5
|
-
from .matrix_base import MatrixRepresentation, matrix_to_dataframe
|
6
|
-
|
7
|
-
from .lib_types import csr_matrix, ndarray, DataFrame, csc_matrix, Matrix, Labels
|
8
|
-
|
9
|
-
# Ordered names of the workflow stages used by TextMatrixBuilder.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which previously fused the last two
# stages into the single bogus name "matrix_operationsend".
STAGES = [
    "start",  # Before any processing
    "matrix_operations",  # Matrix operations and transformations
    "end",  # Dataframe operations
]
|
14
|
-
|
15
|
-
class TextMatrixBuilder(WorkflowManager):
    """
    Creates a Document Term Matrix, a Co-occurrence Matrix, and dataframes from them.

    Every public method is wrapped by the ``requires`` and ``workflow`` decorators imported from ``workflow_tools``.
    NOTE(review): those decorators are not visible here; presumably ``requires`` restricts the stages in which a method may run and ``workflow`` feeds the stored stage attribute into the ``input_arg`` parameter and stores the result -- confirm against ``workflow_tools``.

    :param corpus: A list of sentences or collocations from which to build a matrix.
    """

    _start: list[str]
    _end: DataFrame

    def __init__(self, corpus: list[str] | None = None):
        """
        Initializes TextMatrixBuilder with given attributes.

        :param corpus: Optional list of sentences or collocations; it is handed to ``_set_workflow`` as the workflow input.
        """

        super().__init__()

        # Declared for type checkers only; no value is assigned here.
        self._matrix_operations: MatrixRepresentation

        self._set_workflow(input_arg=corpus)

    @requires(stages=["start"])
    @workflow(input_arg="corpus", input_process="_start", output_process="_matrix_operations", next_stage="matrix_operations")
    def create_dtm(self, *, corpus: list[str], **kwargs) -> MatrixRepresentation:
        """
        Creates Document Term Matrix (DTM).

        :param corpus: A list of sentences or collocations from which to build a matrix.
        :param kwargs: Additional keyword arguments to pass to sklearn's CountVectorizer.
            Common options include:
            - stop_words: If provided, a list of stopwords to remove from the corpus.
            - ngram_range: A tuple (min_n, max_n) specifying the range of n-grams to consider.
            For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
        :return: A MatrixRepresentation whose rows are the corpus entries and whose columns are the vectorizer's feature names.
        """

        vectorizer: CountVectorizer = CountVectorizer(**kwargs)

        dtm: csr_matrix = vectorizer.fit_transform(corpus)

        return MatrixRepresentation(
            matrix=dtm,
            rows=np.array(corpus),
            columns=vectorizer.get_feature_names_out()
        )

    # NOTE(review): ``rows`` is sourced from the stored ``columns`` labels --
    # presumably on purpose, since the product below is indexed by terms on
    # both axes; confirm how ``workflow`` resolves ``input_attrs``.
    @requires(stages=["matrix_operations"])
    @workflow(input_arg="matrix", input_attrs={"matrix": "matrix", "rows": "columns", "columns": "columns"}, input_process="_matrix_operations", output_process="_matrix_operations")
    def create_co_occurrence_matrix(self, *, matrix: csr_matrix, rows: Labels = None, columns: Labels = None) -> MatrixRepresentation:
        """
        Creates a Co-occurrence matrix.

        :param matrix: A Document Term Matrix (DTM) from which to build a Co-occurrence matrix.
        :param rows: Labels attached unchanged to the rows of the result.
        :param columns: Labels attached unchanged to the columns of the result.
        :return: A MatrixRepresentation wrapping ``matrix.T @ matrix`` together with the given labels.
        """

        # Transpose-times-self: a documents x terms input yields a terms x terms matrix.
        co_matrix = matrix.T @ matrix

        return MatrixRepresentation(
            matrix=co_matrix,
            rows=rows,
            columns=columns
        )

    @requires(stages=["matrix_operations"])
    @workflow(input_arg="matrix", input_process="_matrix_operations", output_process="_end", next_stage="end")
    def to_dataframe(self, *, matrix: MatrixRepresentation, **kwargs) -> DataFrame:
        """
        Creates a dataframe from a matrix representation.

        :param matrix: A matrix representation from which to create a dataframe.
        :param kwargs: Additional keyword arguments forwarded to ``matrix_to_dataframe`` (presumably passed on to pandas' DataFrame -- confirm in ``matrix_base``).
        :return: The DataFrame produced by ``matrix_to_dataframe``.
        """

        return matrix_to_dataframe(matrix=matrix, **kwargs)
|