dstklib 1.0.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstk/__init__.py +10 -12
- dstk/adaptors/__init__.py +2 -0
- dstk/adaptors/adaptors.py +91 -0
- dstk/adaptors/typeguards.py +141 -0
- dstk/hooks/__init__.py +2 -0
- dstk/hooks/hook_tools.py +89 -0
- dstk/hooks/type_conversion.py +40 -0
- dstk/lib_types/__init__.py +2 -3
- dstk/lib_types/dstk_types.py +188 -16
- dstk/lib_types/plotly_types.py +1 -0
- dstk/method_index.py +32 -0
- dstk/models/__init__.py +2 -0
- dstk/models/model_tools.py +83 -0
- dstk/models/models.py +191 -0
- dstk/modules/__init__.py +10 -0
- dstk/modules/count_models.py +91 -0
- dstk/modules/data_visualization/__init__.py +2 -0
- dstk/modules/data_visualization/clustering.py +129 -0
- dstk/modules/data_visualization/embeddings.py +101 -0
- dstk/modules/geometric_distance.py +114 -0
- dstk/modules/ngrams.py +156 -0
- dstk/modules/predict_models.py +109 -0
- dstk/modules/text_matrix_builder.py +55 -0
- dstk/modules/text_processor.py +100 -0
- dstk/modules/tokenizer.py +139 -0
- dstk/modules/weight_matrix.py +65 -0
- dstk/templates/__init__.py +2 -0
- dstk/templates/rules.py +59 -0
- dstk/templates/templates.py +231 -0
- dstk/workflows/__init__.py +2 -0
- dstk/workflows/stage_workflows.py +55 -0
- dstk/workflows/workflow_tools.py +383 -0
- dstklib-2.0.0.dist-info/METADATA +377 -0
- dstklib-2.0.0.dist-info/RECORD +43 -0
- dstk/collocations.py +0 -121
- dstk/count_models.py +0 -112
- dstk/geometric_distance.py +0 -107
- dstk/lib_types/matplotlib_types.py +0 -4
- dstk/lib_types/nltk_types.py +0 -1
- dstk/matrix_base.py +0 -113
- dstk/pipeline_tools.py +0 -27
- dstk/pipelines.py +0 -114
- dstk/plot_embeddings.py +0 -240
- dstk/predict_models.py +0 -189
- dstk/text_matrix_builder.py +0 -87
- dstk/text_processor.py +0 -450
- dstk/weight_matrix.py +0 -71
- dstk/workflow_tools.py +0 -257
- dstklib-1.0.2.dist-info/METADATA +0 -369
- dstklib-1.0.2.dist-info/RECORD +0 -28
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
dstk/workflow_tools.py
DELETED
@@ -1,257 +0,0 @@
```python
from functools import wraps
import warnings
import inspect
from copy import deepcopy

from .lib_types.dstk_types import Function, MethodSpec
from inspect import Signature, BoundArguments
from typing import Any, cast, Callable, Type, TypeAlias, TypeGuard


class WorkflowManager:
    """
    Manages the execution of processing methods in workflow mode.

    Tracks workflow state, controls stage transitions, and stores intermediate
    results for chained method execution with enforced sequencing and unit context.
    """

    def __init__(self) -> None:
        """
        Initializes WorkflowManager with the given attributes.
        """

        self._flow: bool
        self._current_stage: str
        self._processing_unit: str

        self._stages: list[str]

        self._start: Any
        self._end: Any

        self._called_methods: list[str] = []

    def _set_workflow(self, input_arg: Any) -> None:
        """
        Initializes workflow mode based on the presence of an input argument.

        Sets the workflow state and starting point if the required input is provided.
        Disables workflow mode if it is missing.

        :param input_arg: The initial data source to store when starting the workflow. If it is not None, workflow mode is activated.
        """

        if input_arg is not None:
            self._start = input_arg
            self._current_stage = "start"
            self._flow = True
        else:
            self._flow = False

    @property
    def result(self) -> Any:
        """
        Returns the current output of the processing workflow.

        Use this property to retrieve the final result after a chain of workflow method calls. It safely copies the internal state (if possible) to prevent side effects.

        :return: The result of the most recent workflow stage.
        """

        result: Any = getattr(self, f"_{self._current_stage}")
        try:
            copy: Any = deepcopy(result)
            return copy
        except Exception:
            return result


class WorkflowBuilder:
    """
    Automates the execution of a sequence of methods on a WorkflowManager subclass.

    :param work_class: A subclass of WorkflowManager representing the workflow to execute.
    :param method_representation: A dictionary mapping method names to their keyword arguments.
    :param result: If True, returns the result of the workflow. Otherwise, returns the instance of the working class. Defaults to True.
    """

    def __init__(self, work_class: Type[WorkflowManager], method_representation: MethodSpec, result: bool = True):
        """
        Initializes WorkflowBuilder with the given attributes.
        """

        self.work_class: Type[WorkflowManager] = work_class
        self.methods: MethodSpec = method_representation
        self.result: bool = result

    def __call__(self, *args, **kwargs) -> Any:
        workflow: WorkflowManager = self.work_class(*args, **kwargs)

        for key, value in self.methods.items():
            method: Callable = getattr(workflow, key)
            method(**value)

        return workflow.result if self.result else workflow


def workflow(input_arg: str, input_process: str, output_process: str, input_attrs: dict[str, Any] | None = None, next_stage: str | None = None, set_unit: str | None = None) -> Callable[[Function], Function]:
    """
    Enables workflow execution for a method by automatically injecting inputs,
    storing outputs, and transitioning stages when in workflow mode.

    :param input_arg: Name of the keyword argument to inject into the method.
    :param input_process: Attribute name to retrieve the input data from if not provided.
    :param output_process: Attribute name to store the method's output in workflow mode.
    :param input_attrs: Optional mapping of argument names to extract from input data. Supports str for attribute access or nested dicts for deep lookup. Defaults to None.
    :param next_stage: Optional name of the next workflow stage to transition to after method execution. Defaults to None.
    :param set_unit: Optional name of the processing unit to set for the next workflow step. Defaults to None.

    :return: A decorator that wraps the method with workflow logic.
    :raises ValueError: If the method is called outside workflow mode without the input argument, or if a value in input_attrs is not None, str, or dict.
    """

    def decorator(method: Function) -> Function:
        @wraps(method)
        def wrapper(self, *args, **kwargs) -> Any:
            method_name: str = method.__name__

            if input_arg not in kwargs:
                if not self._flow:
                    raise ValueError(f"{input_arg} must be provided if not using workflow mode")

                input_data: Any = getattr(self, input_process)

                if input_attrs:
                    for key, value in input_attrs.items():
                        if value is None:
                            kwargs[key] = input_data
                        elif isinstance(value, str):
                            kwargs[key] = getattr(input_data, value)
                        elif isinstance(value, dict):
                            for attr_name, subattr in value.items():
                                mapping = getattr(input_data, attr_name)[subattr]
                                kwargs[key] = mapping
                        else:
                            raise ValueError(f"Type {type(value)} of value not recognized")
                else:
                    kwargs[input_arg] = input_data

            result: Any = method(self, *args, **kwargs)

            if self._flow:
                setattr(self, output_process, result)
                if next_stage:
                    self._current_stage = next_stage
                if set_unit:
                    self._processing_unit = set_unit
                if next_stage == "end":
                    warnings.warn(UserWarning(f"After calling method {method_name} you must call result to continue with the analysis. Further chaining will result in an error."))

                self._called_methods.append(method_name)
                return self

            return result

        return cast(Function, wrapper)

    return decorator


def requires(stages: list[str], unit: str | None = None, multiple_calls: bool = False) -> Callable[[Function], Function]:
    """
    Ensures a method is only callable in workflow mode at allowed stages and units,
    and prevents repeated calls to the same method.

    :param stages: A list of stages where the method is allowed to be used.
    :param unit: The required processing unit; if specified, the method can only run when the current unit matches. Defaults to None.
    :param multiple_calls: If True, allows the method to be called more than once in workflow mode. Defaults to False.

    :return: A decorator that enforces workflow constraints on the wrapped method.
    """

    def decorator(method: Function) -> Function:
        @wraps(method)
        def wrapper(self, *args, **kwargs) -> Any:
            method_name: str = method.__name__

            if self._flow:
                if method_name in self._called_methods and not multiple_calls:
                    raise RuntimeError(f"Method {method_name} already called. You can only call each method exactly once in workflow mode.")
                if not hasattr(self, "_current_stage"):
                    raise RuntimeError("Current phase is not initialized.")
                if self._current_stage not in stages:
                    raise RuntimeError(
                        f"Method '{method_name}' requires stages '{', '.join(stages)}' but current phase is '{self._current_stage}'"
                    )
                if unit and unit != self._processing_unit:
                    raise RuntimeError(
                        f"Method '{method_name}' can only be used when you are processing by {unit}, but you are currently processing by {self._processing_unit}."
                    )

            return method(self, *args, **kwargs)

        return cast(Function, wrapper)

    return decorator


def accepts_generic(*, type_checker: Callable, input_arg: str, accepts: bool, intercept: bool, interceptor: Callable, input_type: TypeAlias, custom_error_message: str = "") -> Callable[[Function], Function]:
    """
    A generic decorator factory that conditionally intercepts and transforms a method's input based on runtime type checks.

    This utility enables creating flexible, reusable decorators (like `accepts_sentences` or `accepts_tags`) by delegating the interception logic to a custom `interceptor` and input validation to a `type_checker`.

    If the input value (specified by `input_arg`) passes the `type_checker`, and both `accepts` and `intercept` are True, the `interceptor` is called instead of the original method. Otherwise, the original method is called directly. If the input is of the expected type but `accepts` is False, a `ValueError` is raised.

    :param type_checker: Function that checks whether the input value is of the expected structure/type.
    :param input_arg: The name of the argument to inspect in the decorated method.
    :param accepts: Whether the method is allowed to handle this type of input.
    :param intercept: Whether the input should be transformed before calling the method.
    :param interceptor: Function that handles the interception logic, replacing the original method call when triggered.
    :param input_type: A human-readable type description used in error messages and type casts. This is for documentation/static typing purposes only.
    :param custom_error_message: Optional custom message to append to the error if the input is rejected.

    :returns: Callable: A decorator that wraps the target method with conditional input handling logic.

    :raises ValueError: If the input matches the expected type but `accepts` is False.
    """

    def decorator(method: Function) -> Function:
        @wraps(method)
        def wrapper(self, *args, **kwargs) -> Any:
            signature: Signature = inspect.signature(method)
            bound_args: BoundArguments = signature.bind(self, *args, **kwargs)
            bound_args.apply_defaults()

            input_value: Any = bound_args.arguments.get(input_arg, None)

            if type_checker(input_value) and (accepts and intercept):
                filtered_kwargs: dict[str, Any] = kwargs.copy()
                filtered_kwargs.pop(input_arg, None)

                return interceptor(self, input_value=input_value, method=method, *args, **filtered_kwargs)
            elif not type_checker(input_value) or (accepts and not intercept):
                return cast(input_type, method(self, *args, **kwargs))
            else:
                raise ValueError(f"Method {method.__name__} does not accept {input_type} as input. {custom_error_message}")

        return cast(Function, wrapper)

    return decorator


def is_method_spec(spec: Any) -> TypeGuard[MethodSpec]:
    """
    If spec is of type MethodSpec, returns True. Otherwise, returns False.

    :param spec: A dict whose type is checked.
    """

    return (
        isinstance(spec, dict) and
        all(
            isinstance(method_name, str) and
            isinstance(params, dict) and
            all(
                isinstance(key, str)
                for key in params.keys()
            )
            for method_name, params in spec.items()
        )
    )
```
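For reference, here is how the three pieces of this deleted module composed: `workflow` injects and stores data, `requires` gates stage order, and `WorkflowBuilder` replays a method sequence. A minimal sketch against the 1.0.2 API; the `TextCleaner` class, its `to_lower` method, and its attribute wiring are hypothetical, invented for illustration, and never shipped with dstklib:

```python
# Hypothetical sketch of the deleted 1.0.2 chaining machinery.
from typing import Any

from dstk.workflow_tools import WorkflowBuilder, WorkflowManager, requires, workflow

class TextCleaner(WorkflowManager):
    def __init__(self, text: str | None = None) -> None:
        super().__init__()
        self._end: Any = None
        self._set_workflow(text)  # workflow mode is active only when text is given

    @requires(stages=["start"])
    @workflow(input_arg="text", input_process="_start", output_process="_end", next_stage="end")
    def to_lower(self, *, text: str | None = None) -> str:
        return (text or "").lower()

# Standalone mode: pass the input explicitly and get the raw value back.
print(TextCleaner().to_lower(text="The Quick Brown Fox"))

# Workflow mode: the decorator injects `text` from _start and stores the output in _end.
print(TextCleaner(text="The Quick Brown Fox").to_lower().result)

# The same chain, automated with WorkflowBuilder.
clean = WorkflowBuilder(work_class=TextCleaner, method_representation={"to_lower": {}})
print(clean(text="The Quick Brown Fox"))
```

In 2.0.0 this module is replaced by the `dstk/workflows/` package (`workflow_tools.py` and `stage_workflows.py` in the file list above).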
dstklib-1.0.2.dist-info/METADATA
DELETED
@@ -1,369 +0,0 @@
Metadata-Version: 2.1
Name: dstklib
Version: 1.0.2
Requires-Python: <3.12
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: spacy
Requires-Dist: matplotlib
Requires-Dist: scikit-learn
Requires-Dist: pandas
Requires-Dist: numpy
Requires-Dist: gensim
Requires-Dist: fasttext
Requires-Dist: kneed
Requires-Dist: umap-learn

# Distributional Semantics Toolkit

This library is based on the book *Distributional Semantics* by Alessandro Lenci and Magnus Sahlgren. It implements some of the algorithms described in the book that are commonly used in distributional semantics.

## Table of Contents

1. [Introduction](#introduction)
2. [Installation](#installation)
3. [Usage](#usage)
4. [Algorithms](#algorithms)
5. [Contributing](#contributing)
6. [License](#license)
7. [Current Status](#current-status)

## Introduction

The toolkit provides a set of classes and methods for conducting research in distributional semantics. It groups its methods by the common tasks followed in distributional semantics. Each task has its own substages, which should be followed in order. For the list of tasks see [Algorithms](#algorithms). For more information about the different tasks, consult the book *Distributional Semantics* by Alessandro Lenci and Magnus Sahlgren.

## Installation

To install it, just run:

```bash
pip install dstklib
```

DSTK requires Python 3.11 to work.

## Usage

The library can be used in three modes:

### Standalone mode

In standalone mode you can use the methods individually. Just instantiate the class that contains the method you want to use (without passing any arguments) and call the method:

```python
from dstk import TextProcessor

tokens = ["The", "Quick", "Brown", "Fox", "Jumps", "Over", "The", "Lazy", "Dog"]

# Do not pass any arguments to the class. Doing so will activate workflow mode.
# Also, arguments must be keyword arguments. Positional arguments are not supported for methods.
lower_tokens = TextProcessor().to_lower(tokens=tokens)

print(lower_tokens)

# Output: ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
```

### Workflow mode

In workflow mode you can chain the desired methods (as long as they follow the order of the stages in which they can be used) and then just call `result`. To use workflow mode just do:

```python
from dstk import TextProcessor

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

# Calling result is important. Otherwise, it will return an instance of the class.
tokens = TextProcessor(text=text).set_model(model=model).get_tokens().remove_stop_words().get_text().result

print(tokens)

# Output: ["quick", "brown", "fox", "jumps", "lazy", "dog", "sun", "sets", "behind", "hills"]
```

#### Automate workflows

If there is a specific workflow you use multiple times, you can automate it with WorkflowBuilder. Just pass the names of the methods you use (in the correct order) and their corresponding arguments as a dictionary, along with the class you are using:

```python
from dstk import TextProcessor, WorkflowBuilder

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

CustomTextWorkflow = WorkflowBuilder(
    work_class=TextProcessor,
    method_representation={
        "set_model": {"model": model},
        "get_tokens": {},
        "remove_stop_words": {},
        "get_text": {}
    }
)

# Pass as an argument the input required by the class
tokens = CustomTextWorkflow(text=text)

print(tokens)

# Output: ["quick", "brown", "fox", "jumps", "lazy", "dog", "sun", "sets", "behind", "hills"]
```

### Pipeline mode

A pipeline is just a set of workflows running one after another. If there are workflows that you constantly use together, you can automate the process with PipelineBuilder. Just pass your workflows as a list:

```python
from dstk import TextProcessor, Collocations, WorkflowBuilder, PipelineBuilder

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

CustomTextWorkflow = WorkflowBuilder(
    work_class=TextProcessor,
    method_representation={
        "set_model": {"model": model},
        "get_tokens": {},
        "remove_stop_words": {},
        "get_text": {}
    }
)

CustomCollocationsWorkflow = WorkflowBuilder(
    work_class=Collocations,
    method_representation={
        "extract_ngrams": {"target_word": "fox", "window_size": [2, 2]},
        "count_collocates": {}
    }
)

CustomPipeline = PipelineBuilder(
    workflows=[
        CustomTextWorkflow,
        CustomCollocationsWorkflow
    ]
)

# Pass as an argument the input required by the class in the first workflow. In this example, the first class is TextProcessor
result = CustomPipeline(text=text)

# Output: Counter({'quick': 1, 'brown': 1, 'jumps': 1, 'over': 1})
```

#### Hooks

You can add hooks (functions with custom logic) to a pipeline. You only need to follow two rules:

1. A hook must accept exactly one input and return one output.
2. The type of its input must match the type returned by the previous workflow, and the type it returns must match the input of the next workflow.

Following these rules, you can insert your custom hooks this way:

```python
from dstk import TextProcessor, Collocations, WorkflowBuilder, PipelineBuilder

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."
model = "my_spacy_model"

CustomTextWorkflow = WorkflowBuilder(
    work_class=TextProcessor,
    method_representation={
        "set_model": {"model": model},
        "get_tokens": {},
        "remove_stop_words": {},
        "get_text": {}
    }
)

CustomCollocationsWorkflow = WorkflowBuilder(
    work_class=Collocations,
    method_representation={
        "extract_ngrams": {"target_word": "fox_hook", "window_size": [2, 2]},
        "count_collocates": {}
    }
)

def custom_hook(tokens):
    return [token + "_hook" for token in tokens]

CustomPipeline = PipelineBuilder(
    workflows=[
        CustomTextWorkflow,
        custom_hook,
        CustomCollocationsWorkflow
    ]
)

# Pass as an argument the input required by the class in the first workflow. In this example, the first class is TextProcessor
result = CustomPipeline(text=text)

# Output: Counter({'quick_hook': 1, 'brown_hook': 1, 'jumps_hook': 1, 'over_hook': 1})
```

## Algorithms

This library groups its methods by the tasks commonly performed in distributional semantics:

### Text pre-processing

#### Class: TextProcessor

The available methods, grouped by stages, are the following:

**Stage: start**

- *set_model*: Takes a text and analyzes it using a language model.

**Stage: model**

- *get_tokens*: Returns a list of spaCy tokens from a Doc object.
- *get_sentences*: Returns a list containing sentences as strings or as spaCy Span objects.

**Stage: token_manipulation**

- *remove_stop_words*: Filters tokens, returning only alphanumeric tokens that are not stop words.
- *raw_tokenizer*: Tokenizes a text including punctuation and stop words.
- *alphanumeric_raw_tokenizer*: Tokenizes a text including only alphanumeric characters and stop words.
- *filter_by_pos*: Returns a list of spaCy tokens filtered by a specific part-of-speech tag.
- *pos_tagger*: Returns a list of (Token, POS) tuples, pairing each token with its part-of-speech tag.
- *get_text*: Returns the text content from a list of spaCy tokens or Span objects.

**Stage: text_processing**

- *to_lower*: Returns a list of lowercased words.
- *corpus_by_context_window*: Splits the tokens into groups of window_size consecutive words and joins each group into a string.
- *get_vocabulary*: Returns the vocabulary of a text.
- *join*: Joins a list of strings into a single string.
- *save_to_file*: Saves a list of strings or (Token, POS) tuples to the specified path.

### Find collocations

#### Class: Collocations

The available methods, grouped by stages, are the following:

**Stage: start**

- *extract_ngrams*: Extracts both the context words of the target collocation, returned as tuples whose length corresponds to the specified window_size, and the collocations of the target word, in either a directed or undirected manner.

**Stage: collocates**

- *count_collocates*: Counts the collocates of the target word.

**Stage: count**

- *plot*: Plots the count of the collocates.

### Build a matrix from a text corpus

#### Class: TextMatrixBuilder

The available methods, grouped by stages, are the following:

**Stage: start**

- *create_dtm*: Creates a Document Term Matrix (DTM).

**Stage: matrix_operations**

- *create_co_ocurrence_matrix*: Creates a co-occurrence matrix.
- *to_dataframe*: Creates a dataframe from a matrix representation.

### Weight the co-occurrence matrix

#### Class: WeightMatrix

The available methods, grouped by stages, are the following:

**Stage: start**

- *pmi*: Weights a co-occurrence matrix by PMI or PPMI.
- *tf_idf*: Weights a co-occurrence matrix by Tf-idf.

### Generate word embeddings from a co-occurrence matrix

#### Class: CountModels

The available methods, grouped by stages, are the following:

**Stage: start**

- *scale_matrix*: Scales the input matrix to have zero mean and unit variance for each feature.

**Stage: embeddings**

- *svd_embeddings*: Generates word embeddings using truncated Singular Value Decomposition (SVD).
- *pca_embeddings*: Generates word embeddings using Principal Component Analysis (PCA).
- *to_dataframe*: Creates a dataframe from a matrix representation.

### Measure the distance between two words (after generating the word embeddings)

#### Class: GeometricDistance

The available methods, grouped by stages, are the following:

**Stage: start**

- *euclidean_distance*: Computes the Euclidean distance between the embeddings of two words.
- *manhattan_distance*: Computes the Manhattan distance between the embeddings of two words.
- *cos_similarity*: Computes the cosine similarity between the embeddings of two words.
- *nearest_neighbors*: Returns the top N most semantically similar words to a given target word, based on the specified distance or similarity metric. A standalone-mode sketch follows this list.
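In standalone mode these methods follow the same keyword-argument convention as TextProcessor above. A minimal sketch; the keyword names (`embeddings`, `word`, `top_n`, `metric`) are assumptions for illustration, not the documented 1.0.2 signature:

```python
import pandas as pd

from dstk import GeometricDistance

# Illustrative embeddings; in practice they come from CountModels.
embeddings = pd.DataFrame(
    [[0.1, 0.9], [0.2, 0.8], [0.9, 0.1]],
    index=["fox", "dog", "car"],
)

# Assumed keyword names, for illustration only.
neighbors = GeometricDistance().nearest_neighbors(
    embeddings=embeddings, word="fox", top_n=2, metric="cos_similarity"
)
print(neighbors)
```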
### Generate word embeddings using neural networks

#### Class: PredictModels

The available methods, grouped by stages, are the following:

**Stage: start**

- *word2vec*: Creates word embeddings using the Word2Vec algorithm.
- *fastText*: Creates word embeddings using the FastText algorithm.
- *load_model*: Loads the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

**Stage: predict_model**

- *save_model*: Saves the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used. Can also be used in stage embeddings_operations.
- *nearest_neighbors*: Returns the top N most semantically similar words to a given target word. Can also be used in stage embeddings_operations.
- *cos_similarity*: Computes the cosine similarity between the embeddings of two words. Can also be used in stage embeddings_operations.
- *to_matrix*: Returns a matrix representation of the word embeddings and their associated labels.

### Plot the word embeddings

#### Class: PlotEmbeddings

The available methods, grouped by stages, are the following:

**Stage: start**

- *elbow_analysis*: Generates an Elbow plot to help determine the optimal number of clusters for the word embeddings.
- *extract_silhouette_score*: Extracts and plots the Silhouette score to help determine the optimal number of clusters for the word embeddings.

**Stage: clusters**

- *plot_embeddings_2D*: Generates a 2D plot of the word embeddings.
- *plot_embeddings_3D*: Generates a 3D plot of the word embeddings.

### Predefined pipelines

DSTK includes some pipelines that already cover most of the frequent tasks in distributional semantics; a usage sketch follows this list.

- *StandardModel*: This pipeline generates word embeddings using the standard model as defined by (Lenci & Sahlgren 97). It preprocesses the text by removing stop words, lowercasing the words and segmenting the text using a context window. The co-occurrence matrix is weighted with PPMI and reduced with truncated SVD.
- *SGNSModel*: This pipeline generates word embeddings using Skip-Gram with Negative Sampling (SGNS) as defined by (Lenci & Sahlgren 162). It preprocesses the text by extracting the sentences, removing stop words and lowercasing them. The embeddings are trained with word2vec using SGNS. Returns an instance of PredictModels.
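A predefined pipeline is called like the custom pipelines built with PipelineBuilder above. A minimal sketch, assuming StandardModel is exported from dstk (it lives in dstk/pipelines.py in 1.0.2) and takes the input of its first workflow:

```python
from dstk import StandardModel  # assumed export, for illustration

text = "The quick brown fox jumps over the lazy dog while the sun sets behind the hills."

# As with the custom pipelines above, pass the input required by the first workflow.
embeddings = StandardModel(text=text)
```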
### Other tools

You can also convert from a MatrixRepresentation to a dataframe and vice versa using `matrix_to_dataframe` and `dataframe_to_matrix` from matrix_base.
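A minimal sketch of the round trip; only the two function names and the module come from the package, while the single-argument call pattern is an assumption:

```python
from dstk.matrix_base import dataframe_to_matrix, matrix_to_dataframe

# `matrix` is a MatrixRepresentation produced elsewhere (e.g. by TextMatrixBuilder);
# the single-argument calls below are an assumed pattern, not the documented signatures.
df = matrix_to_dataframe(matrix)
matrix_again = dataframe_to_matrix(df)
```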
## Contributing

I welcome contributions to improve this toolkit. If you have ideas or fixes, feel free to fork the repository and submit a pull request. Here are some ways you can help:

* Report bugs or issues.

* Suggest new features or algorithms to add.

## License

This project is licensed under the GPL-3 License - see the [LICENSE](https://gitlab.com/CesarACabrera/distributional-semantics-toolkit/-/blob/master/LICENSE?ref_type=heads) file for details.
dstklib-1.0.2.dist-info/RECORD
DELETED
@@ -1,28 +0,0 @@
```
dstk/__init__.py,sha256=X58NNxR1EZ3CCpPMM569T_O-DUNGNjhpe6UoSn7Rrn0,356
dstk/collocations.py,sha256=fn47xrVHW1cgROMSLRbH5M9YxdNvO35tOIKCrXbpJvc,4985
dstk/count_models.py,sha256=D4OINB2Z2jUTyN3scLuNGREV0jHnvcITST3l-kHV9Jo,5489
dstk/geometric_distance.py,sha256=ddlUFv1nl0OKRgNb9_6fFf7loCDC-CQMDffQ6k0lxvE,5107
dstk/matrix_base.py,sha256=zLTM05dYUu7M7WUTV8i7ZQdDkJZ65iddR3ksd4Gy4T4,4199
dstk/pipeline_tools.py,sha256=ve3MfzZMoff6EQrJGyrlLW6ZQHWHEcGJSC_5aMpangk,933
dstk/pipelines.py,sha256=Za7DuCHSaZBSxzJyMOWHgTH3jm0aYkW4lav-nIMNdZo,4530
dstk/plot_embeddings.py,sha256=gfjXJGulRbV3uNC45gqMQoJN4YmfP5VVlpKB6eV7PTU,11210
dstk/predict_models.py,sha256=Cfv3ykzN5-ZIaeuqea0fkjZUy-bFHaDafwWdriBAwe0,8290
dstk/text_matrix_builder.py,sha256=VBs6pDS-YLAw0cuwVRHjgYHRXCNoo449p2IDl1-dKso,3405
dstk/text_processor.py,sha256=tHstoWJIHyC0s0zs0B8PIN2_L9woN7GlkiBPVixYsX4,18433
dstk/weight_matrix.py,sha256=wPBZeNro2ceSQukFzov_6HRFtIklEC_tDVs1Ze-KKQM,2346
dstk/workflow_tools.py,sha256=62zH2N91V9OYT0P9Jk97ahTmevnjmGF1Yda2BoQtSrU,11574
dstk/lib_types/__init__.py,sha256=Ka4bfePHC9HWUTiACBdEsaU8Go2J-C1D7ixeMG89lm4,252
dstk/lib_types/dstk_types.py,sha256=KCQwKav65nAD4VV1kixcWyF-m32HOKjbG0JO0Z6Vjsg,1011
dstk/lib_types/fasttext_types.py,sha256=5LXE77kgCPJHRx0zXlLTs7wRIQOGZiz30Pq0trIXcBA,51
dstk/lib_types/gensim_types.py,sha256=tg3OASG_EWuqFQw_pKM4HNjRk1yrMnmlBqdKm-orxag,34
dstk/lib_types/matplotlib_types.py,sha256=FSP2c6ryTscbuES7w1ccTtcMS1g3k_m-zD7ZlLkfb6I,177
dstk/lib_types/nltk_types.py,sha256=s_UVeJWIEmh2tzvS3ttuRjWXo84quMjIrm4OQf3vms4,21
dstk/lib_types/numpy_types.py,sha256=zxgVrHcRJ-_NGO3LE1aba0d4JQDLYN26us5ljlhIq7E,64
dstk/lib_types/pandas_types.py,sha256=bR27h-xyZ3FccROIHxqYpVvqMNoi1bvIzpq25cf8kkg,43
dstk/lib_types/sklearn_types.py,sha256=W59yIEkZM_E_tW061x1bY-LpRC2aCzLgtYmXANNSN3Q,47
dstk/lib_types/spacy_types.py,sha256=hUiaw4AywSW8o42h5lp3t6a4yosG_GasdJX2RCKgW7o,125
dstklib-1.0.2.dist-info/LICENSE,sha256=LpSgNPBfwn5F4CVhnTbhpiX2f0YgRMzGWQ7Sphuuwuc,35139
dstklib-1.0.2.dist-info/METADATA,sha256=2tOODcd3SODYs1iC49Y2bFjqAHn1D5U3ShqzE6A9LXQ,13309
dstklib-1.0.2.dist-info/WHEEL,sha256=VyG4dJCdJcxE1baiVBm9NET3Nj7Wne1lZZq7UFNxRpg,97
dstklib-1.0.2.dist-info/top_level.txt,sha256=b_MNmKso0-ra2M7snsy5fZBW-l9MItjrwMYBd-tiOYo,5
dstklib-1.0.2.dist-info/RECORD,,
```