dstklib 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. dstk/__init__.py +10 -12
  2. dstk/adaptors/__init__.py +2 -0
  3. dstk/adaptors/adaptors.py +91 -0
  4. dstk/adaptors/typeguards.py +141 -0
  5. dstk/hooks/__init__.py +2 -0
  6. dstk/hooks/hook_tools.py +89 -0
  7. dstk/hooks/type_conversion.py +40 -0
  8. dstk/lib_types/__init__.py +2 -3
  9. dstk/lib_types/dstk_types.py +188 -16
  10. dstk/lib_types/plotly_types.py +1 -0
  11. dstk/method_index.py +32 -0
  12. dstk/models/__init__.py +2 -0
  13. dstk/models/model_tools.py +83 -0
  14. dstk/models/models.py +191 -0
  15. dstk/modules/__init__.py +10 -0
  16. dstk/modules/count_models.py +91 -0
  17. dstk/modules/data_visualization/__init__.py +2 -0
  18. dstk/modules/data_visualization/clustering.py +129 -0
  19. dstk/modules/data_visualization/embeddings.py +101 -0
  20. dstk/modules/geometric_distance.py +114 -0
  21. dstk/modules/ngrams.py +156 -0
  22. dstk/modules/predict_models.py +109 -0
  23. dstk/modules/text_matrix_builder.py +55 -0
  24. dstk/modules/text_processor.py +100 -0
  25. dstk/modules/tokenizer.py +139 -0
  26. dstk/modules/weight_matrix.py +65 -0
  27. dstk/templates/__init__.py +2 -0
  28. dstk/templates/rules.py +59 -0
  29. dstk/templates/templates.py +231 -0
  30. dstk/workflows/__init__.py +2 -0
  31. dstk/workflows/stage_workflows.py +55 -0
  32. dstk/workflows/workflow_tools.py +383 -0
  33. dstklib-2.0.0.dist-info/METADATA +377 -0
  34. dstklib-2.0.0.dist-info/RECORD +43 -0
  35. dstk/collocations.py +0 -121
  36. dstk/count_models.py +0 -112
  37. dstk/geometric_distance.py +0 -107
  38. dstk/lib_types/matplotlib_types.py +0 -4
  39. dstk/lib_types/nltk_types.py +0 -1
  40. dstk/matrix_base.py +0 -113
  41. dstk/pipeline_tools.py +0 -27
  42. dstk/pipelines.py +0 -114
  43. dstk/plot_embeddings.py +0 -240
  44. dstk/predict_models.py +0 -189
  45. dstk/text_matrix_builder.py +0 -87
  46. dstk/text_processor.py +0 -450
  47. dstk/weight_matrix.py +0 -71
  48. dstk/workflow_tools.py +0 -257
  49. dstklib-1.0.1.dist-info/METADATA +0 -360
  50. dstklib-1.0.1.dist-info/RECORD +0 -28
  51. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
  52. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
  53. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
dstklib-2.0.0.dist-info/METADATA ADDED
@@ -0,0 +1,377 @@
1
+ Metadata-Version: 2.1
2
+ Name: dstklib
3
+ Version: 2.0.0
4
+ Requires-Python: <3.12
5
+ Description-Content-Type: text/markdown
6
+ License-File: LICENSE
7
+ Requires-Dist: spacy
8
+ Requires-Dist: plotly
9
+ Requires-Dist: scikit-learn
10
+ Requires-Dist: pandas
11
+ Requires-Dist: numpy
12
+ Requires-Dist: gensim
13
+ Requires-Dist: fasttext
14
+ Requires-Dist: kneed
15
+ Requires-Dist: umap-learn
16
+
17
+ # Distributional Semantics Toolkit
18
+
19
+ This library is based on the book *Distributional Semantics* by Alessandro Lenci and Magnus Sahlgren. It implements some of the algorithms described in the book that are commonly used in distributional semantics.
20
+
21
+ ## Table of Contents
22
+
23
+ 1. [Documentation](#documentation)
24
+ 2. [Installation](#installation)
25
+ 3. [Usage](#usage)
26
+ 5. [Contributing](#contributing)
27
+ 6. [License](#license)
28
+
29
+ ## Documentation
30
+
31
+ You can find the (provisional) basic documentation [here](https://distributional-semantics-toolkit-30269b.gitlab.io/index.html).
32
+
33
+ ## Installation
34
+
35
+ To install it, just run the command:
36
+
37
+ ```bash
38
+ pip install dstklib
39
+ ```
40
+
41
+ DSTK requires Python <3.12 to work.
42
+
43
+ # Usage
44
+
45
+ The library can be used in three modes:
46
+
47
+ ## Standalone mode
48
+
49
+ In standalone mode you can use the methods individually. Just import the method you want to use from its respective module in the `dstk.modules` folder:
50
+
51
+ ```python
52
+ from dstk.modules.text_processor import to_lower
53
+
54
+ words = ["The", "Quick", "Brown", "Fox", "Jumps", "Over", "The", "Lazy", "Dog"]
55
+
56
+ lower_tokens = to_lower(words=words)
57
+
58
+ print(lower_tokens)
59
+
60
+ # Output: ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
61
+ ```
62
+
63
+ ## Workflow mode
64
+
65
+ If there is a specific workflow you use multiple times, you can automate it with WorkflowBuilder. Just pass the names of the methods you want to use (in the correct order) and their corresponding arguments as a list of dictionaries, along with the name of the module you are importing them from:
66
+
67
+ ```python
68
+ from dstk.workflows.workflow_tools import WorkflowBuilder
69
+
70
+ text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
71
+ model = "my_spacy_model"
72
+
73
+ TokenizerWorkflow = WorkflowBuilder(
74
+ name="TokenizerWorkflow",
75
+ module_name="tokenizer",
76
+ workflow=[
77
+ {"apply_model": {"model": model}},
78
+ {"get_tokens": {}},
79
+ {"remove_stop_words": {}}
80
+ ]
81
+ )
82
+
83
+ # Pass the input data as an argument:
84
+ tokens = TokenizerWorkflow(input_data=text)
85
+
86
+ print(tokens)
87
+
88
+ # Output: [quick, brown, fox, jumps, lazy, dog, sun, sets, behind, hills]
89
+ ```
90
+
91
+ You can also get specific results from the workflow, or even all of them, by using `return_methods` and `return_all`:
92
+
93
+ ```python
94
+ from dstk.workflows.workflow_tools import WorkflowBuilder
95
+
96
+ text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
97
+ model = "my_spacy_model"
98
+
99
+ TokenizerWorkflow = WorkflowBuilder(
100
+ name="TokenizerWorkflow",
101
+ module_name="tokenizer",
102
+ workflow=[
103
+ {"apply_model": {"model": model}},
104
+ {"get_tokens": {}},
105
+ {"remove_stop_words": {}}
106
+ ]
107
+ )
108
+
109
+ model, filtered_tokens = TokenizerWorkflow(
110
+ input_data=text,
111
+ return_methods=["apply_model", "remove_stop_words"]
112
+ )
113
+
114
+ print(model)
115
+ print(filtered_tokens)
116
+
117
+ # Output:
118
+ # The quick brown fox jumps over the lazy dog while the sun sets behind the hills.
119
+ # [quick, brown, fox, jumps, lazy, dog, sun, sets, behind, hills]
120
+ ```
121
+
122
+ If you choose to return all of the results, the workflow returns a generator that yields tuples containing the name of each method and its result:
123
+
124
+ ```python
125
+ from dstk.workflows.workflow_tools import WorkflowBuilder
126
+
127
+ text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
128
+ model = "my_spacy_model"
129
+
130
+ TokenizerWorkflow = WorkflowBuilder(
131
+ name="TokenizerWorkflow",
132
+ module_name="tokenizer",
133
+ workflow=[
134
+ {"apply_model": {"model": model}},
135
+ {"get_tokens": {}},
136
+ {"remove_stop_words": {}}
137
+ ]
138
+ )
139
+
140
+ result = TokenizerWorkflow(
141
+ input_data=text,
142
+ return_all=True
143
+ )
144
+
145
+ print(next(result))
146
+ print(next(result))
147
+ print(next(result))
148
+
149
+ # Output:
150
+ # ("apply_model", The quick brown fox jumps over the lazy dog while the sun sets behind the hills.)
151
+ # ("get_tokens", [The, quick, brown, fox, jumps, over, the, lazy, dog, while, the, sun, sets, behind, the, hills])
152
+ # ("remove_stop_words", [quick, brown, fox, jumps, lazy, dog, sun, sets, behind, hills])
153
+ ```
154
+
155
+ You can also make a workflow return a Wrapper class containing methods you might want to use multiple times. For example, if you wish to calculate the `cos_similarity` of different words in the same embeddings, you can do:
156
+
157
+ ```python
158
+ from dstk.workflows.workflow_tools import WorkflowBuilder
159
+
160
+ GeometricDistance = WorkflowBuilder(
161
+ name="GeometricDistance",
162
+ module_name="geometric_distance",
163
+ workflow=[
164
+ {"cos_similarity": {}}, # The methods should NOT have args
165
+ ],
166
+ wrapper=True
167
+ )
168
+
169
+ result = GeometricDistance(input_data=embeddings) # In this example 'embeddings' is a pandas DataFrame containing word embeddings
170
+
171
+ print(result.cos_similarity(first_word="word1", second_word="word2"))
172
+ print(result.cos_similarity(first_word="word3", second_word="word4"))
173
+
174
+ # Output:
175
+ # 0.999999
176
+ # 0.854456
177
+ ```
178
+
179
+ ### Templates
180
+
181
+ Workflows can use templates. Templates are a way to enforce certain rules on the workflow, such as method invocation order or restricting certain methods after others have been called, in order to achieve type safety and minimize errors. To use them just import the template of your respective module and pass it during workflow definition:
182
+
183
+ ```python
184
+ from dstk.workflows.workflow_tools import WorkflowBuilder
185
+ from dstk.templates import TokenizerTemplate
186
+
187
+ text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
188
+ model = "my_spacy_model"
189
+
190
+ TokenizerWorkflow = WorkflowBuilder(
191
+ name="TokenizerWorkflow",
192
+ module_name="tokenizer",
193
+ template=TokenizerTemplate,
194
+ workflow=[
195
+ {"get_tokens": {}},
196
+ {"apply_model": {"model": model}}, # Wrong order. 'apply_model' should go first
197
+ {"remove_stop_words": {}}
198
+ ]
199
+ )
200
+
201
+ tokens = TokenizerWorkflow(input_data=text)
202
+
203
+ # Output: RuntimeError: The method on step select_model must be ['apply_model']. Instead, got method get_tokens
204
+ ```
205
+
206
+ ### Stage workflows
207
+
208
+ Some common tasks in distributional semantics require the use of more than one module, such as text pre-processing (tokenization, text processing and ngram extraction) or plotting embeddings (clustering and drawing the plot). Stage workflows exist to address that problem. They come with predefined templates about which modules should be used, their order, and the methods that can (or cannot) be used after certain choices have been made. To use one, just import the desired stage workflow and pass a StageWorkflow, a dictionary that maps the names of the modules to be used to their respective workflows:
209
+
210
+ ```python
211
+ from dstk.workflows.stage_workflows import TextProcessing
212
+
213
+ text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
214
+ model = "my_spacy_model"
+ stops = ["while"] # example list of custom stop words
215
+
216
+ TextProcessorWorkflow = TextProcessing(
217
+ name="TextProcessorWorkflow",
218
+ workflows={
219
+ "tokenizer": [
220
+ {"apply_model": {"model": model}},
221
+ {"get_tokens": {}},
222
+ {"remove_stop_words": {"custom_stop_words": stops}},
223
+ ],
224
+ "ngrams": [
225
+ {"extract_ngrams": {"window_size": 3}}
226
+ ],
227
+ "text_processor": [
228
+ {"tokens_to_text": {"lemmatize": True}},
229
+ {"to_lower": {}},
230
+ {"join": {}}
231
+ ]
232
+
233
+ }
234
+ )
235
+
236
+ tokens = TextProcessorWorkflow(input_data=text)
+ print(tokens)
237
+
238
+
239
+ # Output: ['the quick brown', 'quick brown fox', 'brown fox jumps', 'fox jumps over', 'jumps over the', 'over the lazy', 'the lazy dog', 'lazy dog while', 'dog while the', 'while the sun', 'the sun set', 'sun set behind', 'set behind the', 'behind the hills']
240
+ ```
241
+
242
+ Just like with WorkflowBuilder, you can also return the results of specific modules, or all of them. Just pass a list with the names of the modules whose results you want to `return_modules`, or `True` to `return_all`. In the latter case, it returns a generator of tuples containing the name of each module and its respective result.
243
+
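+ For example, reusing the `TextProcessorWorkflow` defined above (a minimal sketch; the results are assumed to come back in the order the modules are listed):
+
+ ```python
+ ngrams_result, processed_text = TextProcessorWorkflow(
+     input_data=text,
+     return_modules=["ngrams", "text_processor"]
+ )
+
+ # Or iterate over every (module_name, result) tuple:
+ for module_name, result in TextProcessorWorkflow(input_data=text, return_all=True):
+     print(module_name, result)
+ ```
+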
244
+ ## Models
245
+
246
+ A model is just a set of workflows running one after another. If there are a lot of workflows that you constantly use, you can automate the process by using ModelBuilder. Just pass your workflows as a list:
247
+
248
+ ```python
249
+ from dstk.workflows.workflow_tools import WorkflowBuilder
250
+ from dstk.models.model_tools import ModelBuilder
251
+
252
+ text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
+ model = "my_spacy_model"
253
+
254
+ TokenizerWorkflow = WorkflowBuilder(
255
+ name="TokenizerWorkflow",
256
+ module_name="tokenizer",
257
+ workflow=[
258
+ {"apply_model": {"model": model}},
259
+ {"get_tokens": {}},
260
+ {"remove_stop_words": {}}
261
+ ]
262
+ )
263
+
264
+ TextWorkflow = WorkflowBuilder(
265
+ name="TextWorkflow",
266
+ module_name="text_processor",
267
+ workflow=[
268
+ {"tokens_to_text": {}}
269
+ {"to_lower": {}}
270
+ ]
271
+ )
272
+
273
+ CustomModel = ModelBuilder(
274
+ workflows=[
275
+ TokenizerWorkflow,
276
+ TextWorkflow
277
+ ]
278
+ )
279
+
280
+ # Pass as an argument your input data
281
+ result = CustomModel(input_data=text)
282
+
283
+ print(result)
284
+
285
+ # Output: ["quick", "brown", "fox", "jumps", "lazy", "dog", "sun", "sets", "behind", "hills"]
286
+ ```
287
+
288
+ Just like with WorkflowBuilder and stage workflows, you can also return the results of specific workflows, or all of them. Just pass a list with the names of the workflows (or hooks) whose results you want to `return_workflows`, or `True` to `return_all`. In the latter case, it returns a generator of tuples containing the name of each workflow and its respective result.
289
+
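+ For example, with the `CustomModel` defined above (a minimal sketch; the results are assumed to come back in the order the workflows are listed):
+
+ ```python
+ tokens, lowered = CustomModel(
+     input_data=text,
+     return_workflows=["TokenizerWorkflow", "TextWorkflow"]
+ )
+
+ # Or iterate over every (workflow_name, result) tuple:
+ for name, result in CustomModel(input_data=text, return_all=True):
+     print(name, result)
+ ```
+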
290
+ ### Hooks
291
+
292
+ You can add hooks (functions with custom logic) to a model. You only need to follow two rules:
293
+
294
+ 1. The function must accept exactly one input and return exactly one output.
295
+ 2. The type of its input must be the same as the one returned from the previous workflow. Also, the type it returns must match the input of the next workflow.
296
+
297
+ Following these rules you can insert your custom hooks this way:
298
+
299
+ ```python
300
+ from dstk.workflows.workflow_tools import WorkflowBuilder
301
+ from dstk.models.model_tools import ModelBuilder
302
+ from dstk.hooks.hook_tools import Hook
303
+
304
+ text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
+ model = "my_spacy_model"
305
+
306
+ TokenizerWorkflow = WorkflowBuilder(
307
+ name="TokenizerWorkflow",
308
+ module_name="tokenizer",
309
+ workflow=[
310
+ {"apply_model": {"model": model}},
311
+ {"get_tokens": {}},
312
+ {"remove_stop_words": {}}
313
+ ]
314
+ )
315
+
316
+ TextWorkflow = WorkflowBuilder(
317
+ name="TextWorkflow",
318
+ module_name="text_processor",
319
+ workflow=[
320
+ {"tokens_to_text": {}}
321
+ {"to_lower": {}}
322
+ ]
323
+ )
324
+
325
+ def custom_hook(words):
326
+ return [word + "_hook" for word in words]
327
+
328
+ CustomHook = Hook(name="CustomHook", method=custom_hook)
329
+
330
+ CustomModel = ModelBuilder(
331
+ workflows=[
332
+ TokenizerWorkflow,
333
+ TextWorkflow,
334
+ CustomHook
335
+ ]
336
+ )
337
+
338
+ result = CustomModel(input_data=text)
339
+
340
+ print(result)
341
+
342
+ # Output: ["quick_hook", "brown_hook", "fox_hook", "jumps_hook", "lazy_hook", "dog_hook", "sun_hook", "sets_hook", "behind_hook", "hills_hook"]
343
+ ```
344
+
345
+ ### Predefined Models
346
+
347
+ DSTK includes some predefined models that cover many of the frequent tasks in distributional semantics:
348
+
349
+ - *StandardModel*: This model generates word embeddings using the standard model as defined by (Lenci & Sahlgren 97). It preprocesses the text by removing stop words, lowercasing the words and segmenting the text using a context window. The co-occurrence matrix is weighted with PPMI and reduced with truncated SVD.
350
+
351
+ - *SGNSModel*: This model generates word embeddings using Skip-Gram with Negative Sampling (SGNS) as defined by (Lenci & Sahlgren 162). It preprocesses the text by extracting the sentences, removing stop words and lowercasing the words. The embeddings are generated using word2vec with SGNS. Returns an instance of PredictModels.
352
+
353
+ In order to use them just do:
354
+
355
+ ```python
356
+ from dstk.models.models import StandardModel
357
+
358
+ text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
+ model = "my_spacy_model"
359
+
360
+ result = StandardModel(text=text, model=model, n_components=13)
361
+
362
+ print(result.cos_similarity(first_word="brown", second_word="fox")) # It returns a wrapper with the methods 'cos_similarity' and 'nearest_neighbors' included.
363
+
364
+ # Output: 0.5858588775335568
365
+ ```
366
+
367
+ # Contributing
368
+
369
+ I welcome contributions to improve this toolkit. If you have ideas or fixes, feel free to fork the repository and submit a pull request. Here are some ways you can help:
370
+
371
+ * Report bugs or issues.
372
+
373
+ * Suggest new features or algorithms to add.
374
+
375
+ # License
376
+
377
+ This project is licensed under the GPL-3 License - see the [LICENSE](https://gitlab.com/CesarACabrera/distributional-semantics-toolkit/-/blob/master/LICENSE?ref_type=heads) file for details.
dstklib-2.0.0.dist-info/RECORD ADDED
@@ -0,0 +1,43 @@
1
+ dstk/__init__.py,sha256=zxRGhDnwApMqeLfJ3-ljlliJl3Nve_f2JYOXVw5hx_A,168
2
+ dstk/method_index.py,sha256=7UoUPn_PZ3U10WkXFciqEp_XN5xEy4Wy4Wyxe6a__WQ,791
3
+ dstk/adaptors/__init__.py,sha256=b6asMMPwGZlgFdvrjwlTqbkItB00_4FFdh63-tRMQjs,49
4
+ dstk/adaptors/adaptors.py,sha256=t-BbxHYtHyB9-hxtbQTcMLrYmZ_Kov4WwSoGcilZMmE,4057
5
+ dstk/adaptors/typeguards.py,sha256=Ev5Z_dppKRlgNVho07QMfCkmxY800uL2auxrfGRubd4,4589
6
+ dstk/hooks/__init__.py,sha256=Z8rT2WV-hlhbEUGrPfO71sdpWPf69qu8tSgF6MEhVZo,56
7
+ dstk/hooks/hook_tools.py,sha256=H0pJ5ADJeLtGTacNK39WABO5Hr65HfhTFLzFjtI-3R0,3275
8
+ dstk/hooks/type_conversion.py,sha256=OKyAiCiOGecboy5FGLT43HmnltpAYtDY_LpOPcx5S2k,1395
9
+ dstk/lib_types/__init__.py,sha256=L64qlAbqNavw6fopO467i6r0jOJGSL1d0VsGm35lPWU,222
10
+ dstk/lib_types/dstk_types.py,sha256=vqqvec3eSKO8ozg-aDM8aiWWPXt0qQxuhGXiZzm2RHE,6750
11
+ dstk/lib_types/fasttext_types.py,sha256=5LXE77kgCPJHRx0zXlLTs7wRIQOGZiz30Pq0trIXcBA,51
12
+ dstk/lib_types/gensim_types.py,sha256=tg3OASG_EWuqFQw_pKM4HNjRk1yrMnmlBqdKm-orxag,34
13
+ dstk/lib_types/numpy_types.py,sha256=zxgVrHcRJ-_NGO3LE1aba0d4JQDLYN26us5ljlhIq7E,64
14
+ dstk/lib_types/pandas_types.py,sha256=bR27h-xyZ3FccROIHxqYpVvqMNoi1bvIzpq25cf8kkg,43
15
+ dstk/lib_types/plotly_types.py,sha256=44_qvw7RNhYqq-BvdBALGR_sWTwQRSip7HjM--fkPGQ,39
16
+ dstk/lib_types/sklearn_types.py,sha256=W59yIEkZM_E_tW061x1bY-LpRC2aCzLgtYmXANNSN3Q,47
17
+ dstk/lib_types/spacy_types.py,sha256=hUiaw4AywSW8o42h5lp3t6a4yosG_GasdJX2RCKgW7o,125
18
+ dstk/models/__init__.py,sha256=27_6lGFl0BAZp3rgXnbazEX_uxK9jQ0lf3RkjomzICA,48
19
+ dstk/models/model_tools.py,sha256=IYTG4I1oNhnW3JX4bzXxgBa8phhvO7l0haTxeVMeYhM,3279
20
+ dstk/models/models.py,sha256=wLd_BOP4hxWnCg7PCDOO-ZZ3Yx8jY3MqTLbvbhQqx2M,8152
21
+ dstk/modules/__init__.py,sha256=NKI_C2D86x25lUmcM2r6e0M31rV1-8FZBFHRP2oXF2Q,267
22
+ dstk/modules/count_models.py,sha256=VY8YRao78pmh_3WBX8zWQ94B13J1cSL6Pg91c3qUUyg,4288
23
+ dstk/modules/geometric_distance.py,sha256=Zj9O0p6glJOqcNGkdjP_qucj_en-QeejbbQ8Hr5oUVM,4621
24
+ dstk/modules/ngrams.py,sha256=WZ8Nr05KeBnOH_AnPLfUrBQJkcz1hJxJRwVuIU_qviI,6730
25
+ dstk/modules/predict_models.py,sha256=6Wz75nG7JqtI4fCvidDrYQmEEuQyZ3aQxLv66zq0mI4,3987
26
+ dstk/modules/text_matrix_builder.py,sha256=jAla8fKiKZ8cPHUmdW9V0B1MRGU488JKl4s9h45zLBc,2318
27
+ dstk/modules/text_processor.py,sha256=v-T8qPQwRgXIeZ5eJxRO3e7J-zD6UPzbpb-sRf94Vrs,3600
28
+ dstk/modules/tokenizer.py,sha256=bkALslqlUHzD83tjsI57U0joUIVRLZCdUlFi5TPaMb8,4323
29
+ dstk/modules/weight_matrix.py,sha256=-nP-2VY2YFjOoExeojajWA0fkxtj8GdgJU5QQHBQFyo,2694
30
+ dstk/modules/data_visualization/__init__.py,sha256=OSogy0-_DE7t89KS8Q6n1RNGToqoq2LnmKfLc6cjGOA,52
31
+ dstk/modules/data_visualization/clustering.py,sha256=tyUQtFayH7Jh4uisheaeWkIEn9X3fvBYT-sblfgpVws,4982
32
+ dstk/modules/data_visualization/embeddings.py,sha256=8lPMUB-saog0c3PWiHvuz1sC5YYkqFjn5wCJilnyBfQ,4237
33
+ dstk/templates/__init__.py,sha256=t7R-RKxq-EZukHV7U9zj7dbIrbxGKTIJyRzh-Tm1PiM,45
34
+ dstk/templates/rules.py,sha256=Rg8uUBwWlG0b6P-ObnahImb-ZOphMkXB5COWLRQgZSg,1757
35
+ dstk/templates/templates.py,sha256=i-RV0FJfGpeh1Ja8PxLJetYh2cXH7l-jsV-bmJcdOIU,6447
36
+ dstk/workflows/__init__.py,sha256=sP8X_nwRIquC4NHSyUZ5L8DbxZfUCtD8qBekykcBuW8,61
37
+ dstk/workflows/stage_workflows.py,sha256=LkNRgEHRLmpCNIXJJiUERu45trS3zGv2zC7hcIxUl_s,2639
38
+ dstk/workflows/workflow_tools.py,sha256=M1wcJ5UGeEKf6TSE0-Z2AA2CetmFUrgCGdV5Z4LvD4Y,16101
39
+ dstklib-2.0.0.dist-info/LICENSE,sha256=LpSgNPBfwn5F4CVhnTbhpiX2f0YgRMzGWQ7Sphuuwuc,35139
40
+ dstklib-2.0.0.dist-info/METADATA,sha256=gF35KRqRuMZTZRBJ_05rq8eUwameiBjX2sEafpi4WBU,12863
41
+ dstklib-2.0.0.dist-info/WHEEL,sha256=VyG4dJCdJcxE1baiVBm9NET3Nj7Wne1lZZq7UFNxRpg,97
42
+ dstklib-2.0.0.dist-info/top_level.txt,sha256=b_MNmKso0-ra2M7snsy5fZBW-l9MItjrwMYBd-tiOYo,5
43
+ dstklib-2.0.0.dist-info/RECORD,,
dstk/collocations.py DELETED
@@ -1,121 +0,0 @@
1
- from collections import Counter
2
- from dataclasses import dataclass
3
- import matplotlib.pyplot as plt
4
- from .workflow_tools import workflow, requires, WorkflowManager
5
-
6
- from .lib_types import BarContainer, Collocate
7
-
8
- STAGES = [
9
- "start", # Before any processing
10
- "collocates", # Manipulation of collocates
11
- "count", # Operations dealing with the counts of the words appearing around the target word
12
- "end" # End of the workflow. After this stage the user must necessarily call result to continue with the analysis
13
- ]
14
-
15
- @dataclass
16
- class Ngrams:
17
- collocates: list[tuple[str, ...]]
18
- bigrams: list[Collocate]
19
-
20
- class Collocations(WorkflowManager):
21
- """
22
- Extracts n-grams for a target word within a context window (either directed or undirected). Counts and plots the terms that co-occur with the target word.
23
-
24
- :param tokens: A list of tokenized words (e.g. from a text or corpus) in which to search for the target word and its collocates.
25
-
26
- """
27
-
28
- _start: list[str]
29
- _end: BarContainer
30
-
31
- def __init__(self, tokens: list[str] | None = None):
32
- """
33
- Initializes Collocations with the given attributes.
34
- """
35
-
36
- super().__init__()
37
-
38
- # Stages
39
-
40
- self._collocates: Ngrams
41
- self._count: Counter[str]
42
-
43
- self._set_workflow(input_arg=tokens)
44
-
45
- @requires(stages=["start"])
46
- @workflow(input_arg="tokens", input_process="_start", output_process="_collocates", next_stage="collocates")
47
- def extract_ngrams(self, *, tokens: list[str], target_word: str, window_size: tuple[int, int], directed: bool = False) -> Ngrams:
48
- """
49
- Extracts both the context words of the target collocation, returned as tuples whose length corresponds to the specified window_size, and the collocations of the target word, in either a directed or undirected manner.
50
-
51
- :param tokens: A list of tokenized words (e.g. from a text or corpus) in which to search for the target word and its collocates.
52
- :param target_word: Target word whose collocations are to be identified.
53
- :param window_size: Context window represented as a tuple (left, right) of the number of words to be included to the left and right of the target word.
54
- :param directed: If True, the position of collocates relative to the target word is considered (i.e., direction matters); if False, direction is ignored. Defaults to False.
55
- """
56
-
57
- collocates: list[tuple[str, ...]] = []
58
- bigrams: list[Collocate] = []
59
-
60
- for index, word in enumerate(tokens):
61
- if word == target_word:
62
- start: int = max(0, index - window_size[0])
63
- end: int = min(len(tokens), index + window_size[1] + 1)
64
-
65
- left_context: list[str] = tokens[start:index]
66
- right_context: list[str] = tokens[index + 1:end]
67
-
68
- context: list[str] = left_context + right_context
69
-
70
- collocates.append(tuple(context))
71
-
72
- if directed:
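- # Label each collocate with the side of the target word it came from: "L" for the left context, "R" for the right context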
73
- bigrams.extend([(word, ("L", target_word)) for word in left_context] + [(word, ("R", target_word)) for word in right_context])
74
- else:
75
- bigrams.extend([(word, target_word) for word in context])
76
-
77
- return Ngrams(collocates, bigrams)
78
-
79
- @requires(stages=["collocates"])
80
- @workflow(input_arg="collocates", input_attrs={"collocates": "collocates"}, input_process="_collocates", output_process="_count", next_stage="count")
81
- def count_collocates(self, *, collocates: list[tuple[str, ...]]) -> Counter[str]:
82
- """
83
- Counts the collocates of the target word.
84
-
85
- :param collocates: A list of collocates to count.
86
- """
87
-
88
- all_words: list[str] = [word for collocation in collocates for word in collocation]
89
- word_counts: Counter[str] = Counter(all_words)
90
-
91
- return word_counts
92
-
93
- @requires(stages=["count"])
94
- @workflow(input_arg="word_counts", input_process="_count", output_process="_end", next_stage="end")
95
- def plot(self, *, word_counts: Counter[str], size: int = 10, show: bool = True, path: str | None = None) -> BarContainer:
96
- """
97
- Plots the count of the collocates.
98
-
99
- :param word_counts: A Counter object with the counts of each word.
100
- :param size: The number of the most common collocates to plot. Defaults to 10.
101
- :param show: If True, shows the plot. Defaults to True.
102
- :param path: If provided, saves the plot in the specified path. Defaults to None.
103
- """
104
-
105
- counts: list[tuple[str, int]] = word_counts.most_common(size)
106
-
107
- words: tuple[str, ...]
108
- values: tuple[int, ...]
109
- words, values = zip(*counts)
110
-
111
- fig: BarContainer = plt.bar(words, values)
112
- plt.xlabel("Words")
113
- plt.ylabel("Counts")
114
-
115
- if path:
116
- plt.savefig(path)
117
-
118
- if show:
119
- plt.show()
120
-
121
- return fig
dstk/count_models.py DELETED
@@ -1,112 +0,0 @@
1
- from sklearn.preprocessing import StandardScaler
2
- from sklearn.decomposition import PCA, TruncatedSVD
3
- from .workflow_tools import requires, workflow, WorkflowManager
4
- from .matrix_base import MatrixRepresentation, accept_matrix_representation, matrix_to_dataframe
5
-
6
- from .lib_types import ndarray, DataFrame
7
-
8
- STAGES = [
9
- "start", # Before any transformation to the Co-Matrixis applied
10
- "embeddings", # Result of the embeddings
11
- "end" # Embeddings transformed to dataframe
12
- ]
13
-
14
- class CountModels(WorkflowManager):
15
- """
16
- Generates word embeddings using dimensionality reduction techniques on a co-occurrence matrix, such as SVD and PCA.
17
-
18
- :param co_ocurrence_matrix: A co-occurrence matrix from which embeddings will be generated.
19
- """
20
-
21
- _start: ndarray | DataFrame
22
- _end: DataFrame
23
-
24
- def __init__(self, co_ocurrence_matrix: DataFrame | None = None):
25
- """
26
- Initializes CountModels with given attributes.
27
- """
28
- super().__init__()
29
-
30
- self._embeddings: MatrixRepresentation
31
-
32
- self._set_workflow(input_arg=co_ocurrence_matrix)
33
-
34
- @requires(stages=["start"])
35
- @workflow(input_arg="matrix", input_process="_start", output_process="_embeddings", next_stage="embeddings")
36
- @accept_matrix_representation()
37
- def scale_matrix(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
38
- """
39
- Scales the input matrix to have zero mean and unit variance for each feature.
40
-
41
- This method applies standardization using scikit-learn's StandardScaler, which transforms the data such that each column (feature) has a mean of 0 and a standard deviation of 1.
42
-
43
- :param matrix: The input data to scale.
44
- :param kwargs: Additional keyword arguments to pass to sklearn's StandardScaler. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
45
-
46
- This method supports different matrix forms due to decorator-based preprocessing:
47
- - matrix: ndarray or DataFrame
48
- - matrix representation: MatrixRepresentation
49
- """
50
-
51
- scaler: StandardScaler = StandardScaler(**kwargs)
52
- scaled_matrix: ndarray = scaler.fit_transform(matrix)
53
-
54
- return MatrixRepresentation(scaled_matrix, matrix.index if isinstance(matrix, DataFrame) else None, matrix.columns if isinstance(matrix, DataFrame) else None)
55
-
56
- @requires(stages=["embeddings"])
57
- @workflow(input_arg="matrix", input_process="_embeddings", output_process="_embeddings")
58
- @accept_matrix_representation(override=("columns", None))
59
- def svd_embeddings(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
60
- """
61
- Generates word embeddings using truncated Singular Value Decomposition (SVD).
62
-
63
- :param matrix: A Co-occurrence matrix from which embeddings will be generated.
64
- :param kwargs: Additional keyword arguments to pass to sklearn's TruncatedSVD.
65
- Common options include:
66
- - n_components: Specifies the number of dimensions to reduce the co-occurrence matrix to.
67
- For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
68
-
69
- This method supports different matrix forms due to decorator-based preprocessing:
70
- - matrix: ndarray or DataFrame
71
- - matrix representation: MatrixRepresentation
72
- """
73
-
74
- svd: TruncatedSVD = TruncatedSVD(**kwargs)
75
- embeddings: ndarray = svd.fit_transform(matrix)
76
-
77
- return MatrixRepresentation(embeddings, matrix.index if isinstance(matrix, DataFrame) else None, None)
78
-
79
- @requires(stages=["embeddings"])
80
- @workflow(input_arg="matrix", input_process="_embeddings", output_process="_embeddings")
81
- @accept_matrix_representation(override=("columns", None))
82
- def pca_embeddings(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
83
- """
84
- Generates word embeddings using Principal Component Analysis (PCA).
85
-
86
- :param matrix: A Co-occurrence matrix from which embeddings will be generated.
87
- :param kwargs: Additional keyword arguments to pass to sklearn's PCA.
88
- Common options include:
89
- - n_components: If an integer, specifies the number of dimensions to reduce the co-occurrence matrix to. If a float, the amount of variance to preserve during PCA.
90
- For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
91
-
92
- This method supports different matrix forms due to decorator-based preprocessing:
93
- - matrix: ndarray or DataFrame
94
- - matrix representation: MatrixRepresentation
95
- """
96
-
97
- pca: PCA = PCA(**kwargs)
98
- embeddings: ndarray = pca.fit_transform(matrix)
99
-
100
- return MatrixRepresentation(embeddings, matrix.index if isinstance(matrix, DataFrame) else None, None)
101
-
102
- @requires(stages=["embeddings"])
103
- @workflow(input_arg="matrix", input_process="_embeddings", output_process="_end", next_stage="end")
104
- def to_dataframe(self, *, matrix: MatrixRepresentation, **kwargs) -> DataFrame:
105
- """
106
- Creates a dataframe from a matrix representation.
107
-
108
- :param matrix: A matrix representation from which to create a DataFrame.
109
- :param kwargs: Additional keyword arguments to pass to pandas' DataFrame.
110
- """
111
-
112
- return matrix_to_dataframe(matrix=matrix, **kwargs)