dstklib 1.0.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstk/__init__.py +10 -12
- dstk/adaptors/__init__.py +2 -0
- dstk/adaptors/adaptors.py +91 -0
- dstk/adaptors/typeguards.py +141 -0
- dstk/hooks/__init__.py +2 -0
- dstk/hooks/hook_tools.py +89 -0
- dstk/hooks/type_conversion.py +40 -0
- dstk/lib_types/__init__.py +2 -3
- dstk/lib_types/dstk_types.py +188 -16
- dstk/lib_types/plotly_types.py +1 -0
- dstk/method_index.py +32 -0
- dstk/models/__init__.py +2 -0
- dstk/models/model_tools.py +83 -0
- dstk/models/models.py +191 -0
- dstk/modules/__init__.py +10 -0
- dstk/modules/count_models.py +91 -0
- dstk/modules/data_visualization/__init__.py +2 -0
- dstk/modules/data_visualization/clustering.py +129 -0
- dstk/modules/data_visualization/embeddings.py +101 -0
- dstk/modules/geometric_distance.py +114 -0
- dstk/modules/ngrams.py +156 -0
- dstk/modules/predict_models.py +109 -0
- dstk/modules/text_matrix_builder.py +55 -0
- dstk/modules/text_processor.py +100 -0
- dstk/modules/tokenizer.py +139 -0
- dstk/modules/weight_matrix.py +65 -0
- dstk/templates/__init__.py +2 -0
- dstk/templates/rules.py +59 -0
- dstk/templates/templates.py +231 -0
- dstk/workflows/__init__.py +2 -0
- dstk/workflows/stage_workflows.py +55 -0
- dstk/workflows/workflow_tools.py +383 -0
- dstklib-2.0.0.dist-info/METADATA +377 -0
- dstklib-2.0.0.dist-info/RECORD +43 -0
- dstk/collocations.py +0 -121
- dstk/count_models.py +0 -112
- dstk/geometric_distance.py +0 -107
- dstk/lib_types/matplotlib_types.py +0 -4
- dstk/lib_types/nltk_types.py +0 -1
- dstk/matrix_base.py +0 -113
- dstk/pipeline_tools.py +0 -27
- dstk/pipelines.py +0 -114
- dstk/plot_embeddings.py +0 -240
- dstk/predict_models.py +0 -189
- dstk/text_matrix_builder.py +0 -87
- dstk/text_processor.py +0 -450
- dstk/weight_matrix.py +0 -71
- dstk/workflow_tools.py +0 -257
- dstklib-1.0.2.dist-info/METADATA +0 -369
- dstklib-1.0.2.dist-info/RECORD +0 -28
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
dstklib-2.0.0.dist-info/METADATA
ADDED
@@ -0,0 +1,377 @@
Metadata-Version: 2.1
Name: dstklib
Version: 2.0.0
Requires-Python: <3.12
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: spacy
Requires-Dist: plotly
Requires-Dist: scikit-learn
Requires-Dist: pandas
Requires-Dist: numpy
Requires-Dist: gensim
Requires-Dist: fasttext
Requires-Dist: kneed
Requires-Dist: umap-learn

# Distributional Semantics Toolkit

This library is based on the book *Distributional Semantics* by Alessandro Lenci and Magnus Sahlgren. It implements some of the algorithms described in the book that are commonly used in distributional semantics.

## Table of Contents

1. [Documentation](#documentation)
2. [Installation](#installation)
3. [Usage](#usage)
4. [Contributing](#contributing)
5. [License](#license)

## Documentation

You can find the (provisional) basic documentation [here](https://distributional-semantics-toolkit-30269b.gitlab.io/index.html).

## Installation

To install it, just run:

```bash
pip install dstklib
```

DSTK requires Python <3.12 to work.

# Usage

The library can be used in three modes:

## Standalone mode

In standalone mode you can use the methods individually. Just import the method you want to use from its respective module in the `dstk.modules` folder:

```python
from dstk.modules.text_processor import to_lower

words = ["The", "Quick", "Brown", "Fox", "Jumps", "Over", "The", "Lazy", "Dog"]

lower_tokens = to_lower(words=words)

print(lower_tokens)

# Output: ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
```

## Workflow mode

If there is a specific workflow you use multiple times, you can automate it with WorkflowBuilder. Just pass the names of the methods you want to use (in the correct order) and their corresponding arguments as a list of dictionaries, along with the name of the module you are importing them from:

```python
from dstk.workflows.workflow_tools import WorkflowBuilder

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

TokenizerWorkflow = WorkflowBuilder(
    name="TokenizerWorkflow",
    module_name="tokenizer",
    workflow=[
        {"apply_model": {"model": model}},
        {"get_tokens": {}},
        {"remove_stop_words": {}}
    ]
)

# Pass the input data as an argument:
tokens = TokenizerWorkflow(input_data=text)

print(tokens)

# Output: [quick, brown, fox, jumps, lazy, dog, sun, sets, behind, hills]
```

You can also get specific results from the workflow, or even all of them, by using `return_methods` and `return_all`:

```python
from dstk.workflows.workflow_tools import WorkflowBuilder

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

TokenizerWorkflow = WorkflowBuilder(
    name="TokenizerWorkflow",
    module_name="tokenizer",
    workflow=[
        {"apply_model": {"model": model}},
        {"get_tokens": {}},
        {"remove_stop_words": {}}
    ]
)

model, filtered_tokens = TokenizerWorkflow(
    input_data=text,
    return_methods=["apply_model", "remove_stop_words"]
)

print(model)
print(filtered_tokens)

# Output:
# The quick brown fox jumps over the lazy dog while the sun sets behind the hills.
# [quick, brown, fox, jumps, lazy, dog, sun, sets, behind, hills]
```

If you choose to return all of the results, the workflow returns a generator of tuples, each containing the name of a method and its result:

```python
from dstk.workflows.workflow_tools import WorkflowBuilder

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

TokenizerWorkflow = WorkflowBuilder(
    name="TokenizerWorkflow",
    module_name="tokenizer",
    workflow=[
        {"apply_model": {"model": model}},
        {"get_tokens": {}},
        {"remove_stop_words": {}}
    ]
)

result = TokenizerWorkflow(
    input_data=text,
    return_all=True
)

print(next(result))
print(next(result))
print(next(result))

# Output:
# ("apply_model", The quick brown fox jumps over the lazy dog while the sun sets behind the hills.)
# ("get_tokens", [The, quick, brown, fox, jumps, over, the, lazy, dog, while, the, sun, sets, behind, the, hills])
# ("remove_stop_words", [quick, brown, fox, jumps, lazy, dog, sun, sets, behind, hills])
```

You can also make a workflow return a wrapper class containing methods you might want to use multiple times. For example, if you wish to calculate the `cos_similarity` of different words in the same embeddings, you can do:

```python
from dstk.workflows.workflow_tools import WorkflowBuilder

GeometricDistance = WorkflowBuilder(
    name="GeometricDistance",
    module_name="geometric_distance",
    workflow=[
        {"cos_similarity": {}},  # The methods should NOT have args
    ],
    wrapper=True
)

result = GeometricDistance(input_data=embeddings)  # In this example 'embeddings' is a pandas DataFrame containing word embeddings

print(result.cos_similarity(first_word="word1", second_word="word2"))
print(result.cos_similarity(first_word="word3", second_word="word4"))

# Output:
# 0.999999
# 0.854456
```

### Templates

Workflows can use templates. Templates are a way to enforce certain rules on a workflow, such as method invocation order or restricting certain methods after others have been called, in order to achieve type safety and minimize errors. To use them, just import the template for your module and pass it when defining the workflow:

```python
from dstk.workflows.workflow_tools import WorkflowBuilder
from dstk.templates import TokenizerTemplate

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

TokenizerWorkflow = WorkflowBuilder(
    name="TokenizerWorkflow",
    module_name="tokenizer",
    template=TokenizerTemplate,
    workflow=[
        {"get_tokens": {}},
        {"apply_model": {"model": model}},  # Wrong order. 'apply_model' should go first
        {"remove_stop_words": {}}
    ]
)

tokens = TokenizerWorkflow(input_data=text)

# Output: RuntimeError: The method on step select_model must be ['apply_model']. Instead, got method get_tokens
```

### Stage workflows

Some common tasks in distributional semantics require more than one module, such as text pre-processing (tokenization, text processing and n-gram extraction) or plotting embeddings (clustering and drawing the plot). Stage workflows exist to address that problem. They come with predefined templates that specify which modules should be used, their order, and the methods that can (or cannot) be used after certain choices have been made. To use one, just import the desired stage workflow and pass it a StageWorkflow, a dictionary that maps the names of the modules to be used to their respective workflows:

```python
from dstk.workflows.stage_workflows import TextProcessing

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

TextProcessorWorkflow = TextProcessing(
    name="TextProcessorWorkflow",
    workflows={
        "tokenizer": [
            {"apply_model": {"model": model}},
            {"get_tokens": {}},
            {"remove_stop_words": {"custom_stop_words": stops}},  # 'stops' is a list of additional stop words defined elsewhere
        ],
        "ngrams": [
            {"extract_ngrams": {"window_size": 3}}
        ],
        "text_processor": [
            {"tokens_to_text": {"lemmatize": True}},
            {"to_lower": {}},
            {"join": {}}
        ]
    }
)

ngrams = TextProcessorWorkflow(input_data=text)

print(ngrams)

# Output: ['the quick brown', 'quick brown fox', 'brown fox jumps', 'fox jumps over', 'jumps over the', 'over the lazy', 'the lazy dog', 'lazy dog while', 'dog while the', 'while the sun', 'the sun set', 'sun set behind', 'set behind the', 'behind the hills']
```

Just like in the case of WorkflowBuilder, you can also return the results of specific modules, or all of them. Just pass a list with the names of the modules you want to return to `return_modules`, or `True` to `return_all`. In the latter case, it will return a generator of tuples containing the name of each module and its respective result.
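For example, reusing the `TextProcessorWorkflow` defined above, a minimal sketch (assuming stage workflows unpack `return_modules` results the same way `WorkflowBuilder` unpacks `return_methods`) looks like this:

```python
# Return the intermediate result of the "tokenizer" module along with the
# final "text_processor" output (unpacking assumed to mirror return_methods).
tokens, ngram_text = TextProcessorWorkflow(
    input_data=text,
    return_modules=["tokenizer", "text_processor"]
)

# Or iterate over every module's result as (module_name, result) tuples:
for module_name, module_result in TextProcessorWorkflow(input_data=text, return_all=True):
    print(module_name, module_result)
```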

## Models

A model is just a set of workflows that run one after another. If there are a lot of workflows that you constantly use together, you can automate the process with ModelBuilder. Just pass your workflows as a list:

```python
from dstk.workflows.workflow_tools import WorkflowBuilder
from dstk.models.model_tools import ModelBuilder

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

TokenizerWorkflow = WorkflowBuilder(
    name="TokenizerWorkflow",
    module_name="tokenizer",
    workflow=[
        {"apply_model": {"model": model}},
        {"get_tokens": {}},
        {"remove_stop_words": {}}
    ]
)

TextWorkflow = WorkflowBuilder(
    name="TextWorkflow",
    module_name="text_processor",
    workflow=[
        {"tokens_to_text": {}},
        {"to_lower": {}}
    ]
)

CustomModel = ModelBuilder(
    workflows=[
        TokenizerWorkflow,
        TextWorkflow
    ]
)

# Pass your input data as an argument
result = CustomModel(input_data=text)

print(result)

# Output: ["quick", "brown", "fox", "jumps", "lazy", "dog", "sun", "sets", "behind", "hills"]
```

Just like in the case of WorkflowBuilder and stage workflows, you can also return the results of specific workflows, or all of them. Just pass a list with the names of the workflows (or hooks) you want to return to `return_workflows`, or `True` to `return_all`. In the latter case, it will return a generator of tuples containing the name of each workflow and its respective result.
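For instance, continuing with the `CustomModel` defined above, a minimal sketch (assuming the names expected by `return_workflows` are the `name` strings given to each `WorkflowBuilder`) looks like this:

```python
# Return the intermediate output of TokenizerWorkflow together with the final result.
tokenized, processed = CustomModel(
    input_data=text,
    return_workflows=["TokenizerWorkflow", "TextWorkflow"]
)

# Or get every workflow's result as (workflow_name, result) tuples:
for workflow_name, workflow_result in CustomModel(input_data=text, return_all=True):
    print(workflow_name, workflow_result)
```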

### Hooks

You can add hooks (functions with custom logic) to a model. You only have to follow two rules:

1. The function must accept exactly one input and return one output.
2. The type of its input must match the type returned by the previous workflow, and the type it returns must match the input of the next workflow.

Following these rules, you can insert your custom hooks this way:

```python
from dstk.workflows.workflow_tools import WorkflowBuilder
from dstk.models.model_tools import ModelBuilder
from dstk.hooks.hook_tools import Hook

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

TokenizerWorkflow = WorkflowBuilder(
    name="TokenizerWorkflow",
    module_name="tokenizer",
    workflow=[
        {"apply_model": {"model": model}},
        {"get_tokens": {}},
        {"remove_stop_words": {}}
    ]
)

TextWorkflow = WorkflowBuilder(
    name="TextWorkflow",
    module_name="text_processor",
    workflow=[
        {"tokens_to_text": {}},
        {"to_lower": {}}
    ]
)

def custom_hook(words):
    return [word + "_hook" for word in words]

CustomHook = Hook(name="CustomHook", method=custom_hook)

CustomModel = ModelBuilder(
    workflows=[
        TokenizerWorkflow,
        TextWorkflow,
        CustomHook
    ]
)

result = CustomModel(input_data=text)

print(result)

# Output: ["quick_hook", "brown_hook", "fox_hook", "jumps_hook", "lazy_hook", "dog_hook", "sun_hook", "sets_hook", "behind_hook", "hills_hook"]
```

### Predefined Models

DSTK includes some models that already cover most of the frequent tasks in distributional semantics:

- *StandardModel*: This model generates word embeddings using the standard model as defined in (Lenci & Sahlgren 97). It preprocesses the text by removing stop words, lowercasing the words and segmenting the text using a context window. The co-occurrence matrix is weighted with PPMI and reduced with truncated SVD.

- *SGNSModel*: This model generates word embeddings using Skip-Gram with Negative Sampling (SGNS) as defined in (Lenci & Sahlgren 162). It preprocesses the text by extracting the sentences, removing stop words and lowercasing them. The embeddings are extracted by using word2vec to perform SGNS. Returns an instance of PredictModels.

To use them, just do:

```python
from dstk.models.models import StandardModel

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

result = StandardModel(text=text, model=model, n_components=13)

print(result.cos_similarity(first_word="brown", second_word="fox"))  # It returns a wrapper with the methods 'cos_similarity' and 'nearest_neighbors' included.

# Output: 0.5858588775335568
```

# Contributing

I welcome contributions to improve this toolkit. If you have ideas or fixes, feel free to fork the repository and submit a pull request. Here are some ways you can help:

* Report bugs or issues.

* Suggest new features or algorithms to add.

# License

This project is licensed under the GPL-3 License - see the [LICENSE](https://gitlab.com/CesarACabrera/distributional-semantics-toolkit/-/blob/master/LICENSE?ref_type=heads) file for details.

dstklib-2.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,43 @@
dstk/__init__.py,sha256=zxRGhDnwApMqeLfJ3-ljlliJl3Nve_f2JYOXVw5hx_A,168
dstk/method_index.py,sha256=7UoUPn_PZ3U10WkXFciqEp_XN5xEy4Wy4Wyxe6a__WQ,791
dstk/adaptors/__init__.py,sha256=b6asMMPwGZlgFdvrjwlTqbkItB00_4FFdh63-tRMQjs,49
dstk/adaptors/adaptors.py,sha256=t-BbxHYtHyB9-hxtbQTcMLrYmZ_Kov4WwSoGcilZMmE,4057
dstk/adaptors/typeguards.py,sha256=Ev5Z_dppKRlgNVho07QMfCkmxY800uL2auxrfGRubd4,4589
dstk/hooks/__init__.py,sha256=Z8rT2WV-hlhbEUGrPfO71sdpWPf69qu8tSgF6MEhVZo,56
dstk/hooks/hook_tools.py,sha256=H0pJ5ADJeLtGTacNK39WABO5Hr65HfhTFLzFjtI-3R0,3275
dstk/hooks/type_conversion.py,sha256=OKyAiCiOGecboy5FGLT43HmnltpAYtDY_LpOPcx5S2k,1395
dstk/lib_types/__init__.py,sha256=L64qlAbqNavw6fopO467i6r0jOJGSL1d0VsGm35lPWU,222
dstk/lib_types/dstk_types.py,sha256=vqqvec3eSKO8ozg-aDM8aiWWPXt0qQxuhGXiZzm2RHE,6750
dstk/lib_types/fasttext_types.py,sha256=5LXE77kgCPJHRx0zXlLTs7wRIQOGZiz30Pq0trIXcBA,51
dstk/lib_types/gensim_types.py,sha256=tg3OASG_EWuqFQw_pKM4HNjRk1yrMnmlBqdKm-orxag,34
dstk/lib_types/numpy_types.py,sha256=zxgVrHcRJ-_NGO3LE1aba0d4JQDLYN26us5ljlhIq7E,64
dstk/lib_types/pandas_types.py,sha256=bR27h-xyZ3FccROIHxqYpVvqMNoi1bvIzpq25cf8kkg,43
dstk/lib_types/plotly_types.py,sha256=44_qvw7RNhYqq-BvdBALGR_sWTwQRSip7HjM--fkPGQ,39
dstk/lib_types/sklearn_types.py,sha256=W59yIEkZM_E_tW061x1bY-LpRC2aCzLgtYmXANNSN3Q,47
dstk/lib_types/spacy_types.py,sha256=hUiaw4AywSW8o42h5lp3t6a4yosG_GasdJX2RCKgW7o,125
dstk/models/__init__.py,sha256=27_6lGFl0BAZp3rgXnbazEX_uxK9jQ0lf3RkjomzICA,48
dstk/models/model_tools.py,sha256=IYTG4I1oNhnW3JX4bzXxgBa8phhvO7l0haTxeVMeYhM,3279
dstk/models/models.py,sha256=wLd_BOP4hxWnCg7PCDOO-ZZ3Yx8jY3MqTLbvbhQqx2M,8152
dstk/modules/__init__.py,sha256=NKI_C2D86x25lUmcM2r6e0M31rV1-8FZBFHRP2oXF2Q,267
dstk/modules/count_models.py,sha256=VY8YRao78pmh_3WBX8zWQ94B13J1cSL6Pg91c3qUUyg,4288
dstk/modules/geometric_distance.py,sha256=Zj9O0p6glJOqcNGkdjP_qucj_en-QeejbbQ8Hr5oUVM,4621
dstk/modules/ngrams.py,sha256=WZ8Nr05KeBnOH_AnPLfUrBQJkcz1hJxJRwVuIU_qviI,6730
dstk/modules/predict_models.py,sha256=6Wz75nG7JqtI4fCvidDrYQmEEuQyZ3aQxLv66zq0mI4,3987
dstk/modules/text_matrix_builder.py,sha256=jAla8fKiKZ8cPHUmdW9V0B1MRGU488JKl4s9h45zLBc,2318
dstk/modules/text_processor.py,sha256=v-T8qPQwRgXIeZ5eJxRO3e7J-zD6UPzbpb-sRf94Vrs,3600
dstk/modules/tokenizer.py,sha256=bkALslqlUHzD83tjsI57U0joUIVRLZCdUlFi5TPaMb8,4323
dstk/modules/weight_matrix.py,sha256=-nP-2VY2YFjOoExeojajWA0fkxtj8GdgJU5QQHBQFyo,2694
dstk/modules/data_visualization/__init__.py,sha256=OSogy0-_DE7t89KS8Q6n1RNGToqoq2LnmKfLc6cjGOA,52
dstk/modules/data_visualization/clustering.py,sha256=tyUQtFayH7Jh4uisheaeWkIEn9X3fvBYT-sblfgpVws,4982
dstk/modules/data_visualization/embeddings.py,sha256=8lPMUB-saog0c3PWiHvuz1sC5YYkqFjn5wCJilnyBfQ,4237
dstk/templates/__init__.py,sha256=t7R-RKxq-EZukHV7U9zj7dbIrbxGKTIJyRzh-Tm1PiM,45
dstk/templates/rules.py,sha256=Rg8uUBwWlG0b6P-ObnahImb-ZOphMkXB5COWLRQgZSg,1757
dstk/templates/templates.py,sha256=i-RV0FJfGpeh1Ja8PxLJetYh2cXH7l-jsV-bmJcdOIU,6447
dstk/workflows/__init__.py,sha256=sP8X_nwRIquC4NHSyUZ5L8DbxZfUCtD8qBekykcBuW8,61
dstk/workflows/stage_workflows.py,sha256=LkNRgEHRLmpCNIXJJiUERu45trS3zGv2zC7hcIxUl_s,2639
dstk/workflows/workflow_tools.py,sha256=M1wcJ5UGeEKf6TSE0-Z2AA2CetmFUrgCGdV5Z4LvD4Y,16101
dstklib-2.0.0.dist-info/LICENSE,sha256=LpSgNPBfwn5F4CVhnTbhpiX2f0YgRMzGWQ7Sphuuwuc,35139
dstklib-2.0.0.dist-info/METADATA,sha256=gF35KRqRuMZTZRBJ_05rq8eUwameiBjX2sEafpi4WBU,12863
dstklib-2.0.0.dist-info/WHEEL,sha256=VyG4dJCdJcxE1baiVBm9NET3Nj7Wne1lZZq7UFNxRpg,97
dstklib-2.0.0.dist-info/top_level.txt,sha256=b_MNmKso0-ra2M7snsy5fZBW-l9MItjrwMYBd-tiOYo,5
dstklib-2.0.0.dist-info/RECORD,,

dstk/collocations.py
DELETED
@@ -1,121 +0,0 @@
from collections import Counter
from dataclasses import dataclass
import matplotlib.pyplot as plt
from .workflow_tools import workflow, requires, WorkflowManager

from .lib_types import BarContainer, Collocate

STAGES = [
    "start",        # Before any processing
    "collocates",   # Manipulation of collocates
    "count",        # Operations dealing with the counts of the words appearing around the target word
    "end"           # End of the workflow. After this stage the user must call result to continue with the analysis
]

@dataclass
class Ngrams:
    collocates: list[tuple[str, ...]]
    bigrams: list[Collocate]

class Collocations(WorkflowManager):
    """
    Extracts n-grams for a target word within a context window (either directed or undirected). Counts and plots the terms that co-occur with the target.

    :param tokens: A list of tokenized words (e.g. from a text or corpus) in which to search for the target word and its collocates.
    """

    _start: list[str]
    _end: BarContainer

    def __init__(self, tokens: list[str] | None = None):
        """
        Initializes Collocations with the given attributes.
        """

        super().__init__()

        # Stages

        self._collocates: Ngrams
        self._count: Counter[str]

        self._set_workflow(input_arg=tokens)

    @requires(stages=["start"])
    @workflow(input_arg="tokens", input_process="_start", output_process="_collocates", next_stage="collocates")
    def extract_ngrams(self, *, tokens: list[str], target_word: str, window_size: tuple[int, int], directed: bool = False) -> Ngrams:
        """
        Extracts both the context words of the target collocation, returned as tuples whose length corresponds to the specified window_size, and the collocations of the target word, in either a directed or undirected manner.

        :param tokens: A list of tokenized words (e.g. from a text or corpus) in which to search for the target word and its collocates.
        :param target_word: Target word whose collocations are to be identified.
        :param window_size: Context window represented as a tuple (left, right) of the number of words to be included to the left and right of the target word.
        :param directed: If True, the position of collocates relative to the target word is considered (i.e., direction matters); if False, direction is ignored. Defaults to False.
        """

        collocates: list[tuple[str, ...]] = []
        bigrams: list[Collocate] = []

        for index, word in enumerate(tokens):
            if word == target_word:
                start: int = max(0, index - window_size[0])
                end: int = min(len(tokens), index + window_size[1] + 1)

                left_context: list[str] = tokens[start:index]
                right_context: list[str] = tokens[index + 1:end]

                context: list[str] = left_context + right_context

                collocates.append(tuple(context))

                if directed:
                    bigrams.extend([(word, ("L", target_word)) for word in left_context] + [(word, ("R", target_word)) for word in right_context])
                else:
                    bigrams.extend([(word, target_word) for word in context])

        return Ngrams(collocates, bigrams)

    @requires(stages=["collocates"])
    @workflow(input_arg="collocates", input_attrs={"collocates": "collocates"}, input_process="_collocates", output_process="_count", next_stage="count")
    def count_collocates(self, *, collocates: list[tuple[str, ...]]) -> Counter[str]:
        """
        Counts the collocates of the target word.

        :param collocates: A list of collocates to count.
        """

        all_words: list[str] = [word for collocation in collocates for word in collocation]
        word_counts: Counter[str] = Counter(all_words)

        return word_counts

    @requires(stages=["count"])
    @workflow(input_arg="word_counts", input_process="_count", output_process="_end", next_stage="end")
    def plot(self, *, word_counts: Counter[str], size: int = 10, show: bool = True, path: str | None = None) -> BarContainer:
        """
        Plots the counts of the collocates.

        :param word_counts: A Counter object with the counts of each word.
        :param size: The number of the most common collocates to plot. Defaults to 10.
        :param show: If True, shows the plot. Defaults to True.
        :param path: If provided, saves the plot to the specified path. Defaults to None.
        """

        counts: list[tuple[str, int]] = word_counts.most_common(size)

        words: tuple[str, ...]
        values: tuple[int, ...]
        words, values = zip(*counts)

        fig: BarContainer = plt.bar(words, values)
        plt.xlabel("Words")
        plt.ylabel("Counts")

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return fig

dstk/count_models.py
DELETED
@@ -1,112 +0,0 @@
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from .workflow_tools import requires, workflow, WorkflowManager
from .matrix_base import MatrixRepresentation, accept_matrix_representation, matrix_to_dataframe

from .lib_types import ndarray, DataFrame

STAGES = [
    "start",        # Before any transformation to the co-occurrence matrix is applied
    "embeddings",   # Result of the embeddings
    "end"           # Embeddings transformed to a dataframe
]

class CountModels(WorkflowManager):
    """
    Generates word embeddings using dimensionality reduction techniques on a co-occurrence matrix, such as SVD and PCA.

    :param co_ocurrence_matrix: A co-occurrence matrix from which embeddings will be generated.
    """

    _start: ndarray | DataFrame
    _end: DataFrame

    def __init__(self, co_ocurrence_matrix: DataFrame | None = None):
        """
        Initializes CountModels with the given attributes.
        """
        super().__init__()

        self._embeddings: MatrixRepresentation

        self._set_workflow(input_arg=co_ocurrence_matrix)

    @requires(stages=["start"])
    @workflow(input_arg="matrix", input_process="_start", output_process="_embeddings", next_stage="embeddings")
    @accept_matrix_representation()
    def scale_matrix(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
        """
        Scales the input matrix to have zero mean and unit variance for each feature.

        This method applies standardization using scikit-learn's StandardScaler, which transforms the data such that each column (feature) has a mean of 0 and a standard deviation of 1.

        :param matrix: The input data to scale.
        :param kwargs: Additional keyword arguments to pass to sklearn's StandardScaler. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray or DataFrame
            - matrix representation: MatrixRepresentation
        """

        scaler: StandardScaler = StandardScaler(**kwargs)
        scaled_matrix: ndarray = scaler.fit_transform(matrix)

        return MatrixRepresentation(scaled_matrix, matrix.index if isinstance(matrix, DataFrame) else None, matrix.columns if isinstance(matrix, DataFrame) else None)

    @requires(stages=["embeddings"])
    @workflow(input_arg="matrix", input_process="_embeddings", output_process="_embeddings")
    @accept_matrix_representation(override=("columns", None))
    def svd_embeddings(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
        """
        Generates word embeddings using truncated Singular Value Decomposition (SVD).

        :param matrix: A co-occurrence matrix from which embeddings will be generated.
        :param kwargs: Additional keyword arguments to pass to sklearn's TruncatedSVD.
            Common options include:
            - n_components: Specifies the number of dimensions to reduce the co-occurrence matrix to.
            For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray or DataFrame
            - matrix representation: MatrixRepresentation
        """

        svd: TruncatedSVD = TruncatedSVD(**kwargs)
        embeddings: ndarray = svd.fit_transform(matrix)

        return MatrixRepresentation(embeddings, matrix.index if isinstance(matrix, DataFrame) else None, None)

    @requires(stages=["embeddings"])
    @workflow(input_arg="matrix", input_process="_embeddings", output_process="_embeddings")
    @accept_matrix_representation(override=("columns", None))
    def pca_embeddings(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
        """
        Generates word embeddings using Principal Component Analysis (PCA).

        :param matrix: A co-occurrence matrix from which embeddings will be generated.
        :param kwargs: Additional keyword arguments to pass to sklearn's PCA.
            Common options include:
            - n_components: If an integer, specifies the number of dimensions to reduce the co-occurrence matrix to. If a float, the amount of variance to preserve during PCA.
            For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray or DataFrame
            - matrix representation: MatrixRepresentation
        """

        pca: PCA = PCA(**kwargs)
        embeddings: ndarray = pca.fit_transform(matrix)

        return MatrixRepresentation(embeddings, matrix.index if isinstance(matrix, DataFrame) else None, None)

    @requires(stages=["embeddings"])
    @workflow(input_arg="matrix", input_process="_embeddings", output_process="_end", next_stage="end")
    def to_dataframe(self, *, matrix: MatrixRepresentation, **kwargs) -> DataFrame:
        """
        Creates a dataframe from a matrix representation.

        :param matrix: A matrix representation from which to create a dataframe.
        :param kwargs: Additional keyword arguments to pass to pandas' DataFrame.
        """

        return matrix_to_dataframe(matrix=matrix, **kwargs)