dstklib 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstk/__init__.py +10 -12
- dstk/adaptors/__init__.py +2 -0
- dstk/adaptors/adaptors.py +91 -0
- dstk/adaptors/typeguards.py +141 -0
- dstk/hooks/__init__.py +2 -0
- dstk/hooks/hook_tools.py +89 -0
- dstk/hooks/type_conversion.py +40 -0
- dstk/lib_types/__init__.py +2 -3
- dstk/lib_types/dstk_types.py +188 -16
- dstk/lib_types/plotly_types.py +1 -0
- dstk/method_index.py +32 -0
- dstk/models/__init__.py +2 -0
- dstk/models/model_tools.py +83 -0
- dstk/models/models.py +191 -0
- dstk/modules/__init__.py +10 -0
- dstk/modules/count_models.py +91 -0
- dstk/modules/data_visualization/__init__.py +2 -0
- dstk/modules/data_visualization/clustering.py +129 -0
- dstk/modules/data_visualization/embeddings.py +101 -0
- dstk/modules/geometric_distance.py +114 -0
- dstk/modules/ngrams.py +156 -0
- dstk/modules/predict_models.py +109 -0
- dstk/modules/text_matrix_builder.py +55 -0
- dstk/modules/text_processor.py +100 -0
- dstk/modules/tokenizer.py +139 -0
- dstk/modules/weight_matrix.py +65 -0
- dstk/templates/__init__.py +2 -0
- dstk/templates/rules.py +59 -0
- dstk/templates/templates.py +231 -0
- dstk/workflows/__init__.py +2 -0
- dstk/workflows/stage_workflows.py +55 -0
- dstk/workflows/workflow_tools.py +383 -0
- dstklib-2.0.0.dist-info/METADATA +377 -0
- dstklib-2.0.0.dist-info/RECORD +43 -0
- dstk/collocations.py +0 -121
- dstk/count_models.py +0 -112
- dstk/geometric_distance.py +0 -107
- dstk/lib_types/matplotlib_types.py +0 -4
- dstk/lib_types/nltk_types.py +0 -1
- dstk/matrix_base.py +0 -113
- dstk/pipeline_tools.py +0 -27
- dstk/pipelines.py +0 -114
- dstk/plot_embeddings.py +0 -240
- dstk/predict_models.py +0 -189
- dstk/text_matrix_builder.py +0 -87
- dstk/text_processor.py +0 -450
- dstk/weight_matrix.py +0 -71
- dstk/workflow_tools.py +0 -257
- dstklib-1.0.1.dist-info/METADATA +0 -360
- dstklib-1.0.1.dist-info/RECORD +0 -28
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,383 @@
|
|
1
|
+
"""
|
2
|
+
This module provides classes for defining, validating, and executing complex workflows composed of multiple processing steps and stages. It supports dynamic method invocation from specified modules, workflow validation against templates with type and step rules, and optional method wrapping for object-oriented usage.
|
3
|
+
|
4
|
+
Key components:
|
5
|
+
|
6
|
+
* Wrapper: Simple container for input data, enabling method injection.
|
7
|
+
* WorkflowBuilder: Automates sequential execution of methods in a single workflow, including validation and optional wrapping.
|
8
|
+
* StageWorkflowBuilder: Manages multiple workflows organized in stages and modules, enforcing stage/module constraints and chaining workflows.
|
9
|
+
|
10
|
+
This module is designed to facilitate building flexible, validated processing workflows with dynamic and modular behavior.
|
11
|
+
"""
|
12
|
+
|
13
|
+
import importlib
|
14
|
+
from ..templates.rules import TypeRules
|
15
|
+
from ..adaptors import accepts_sentences_and_collocates, accepts_tags, is_workflow
|
16
|
+
from types import ModuleType
|
17
|
+
import warnings
|
18
|
+
from functools import wraps
|
19
|
+
|
20
|
+
from typing import Any, Callable, TypeVar, ParamSpec, Concatenate
|
21
|
+
from ..lib_types import Workflow, WorkflowTemplate, StageWorkflow, StepResult, StepConfig, RulesTemplate, StageTemplate, StageModules, StepGenerator, ResultGenerator
|
22
|
+
|
23
|
+
P = ParamSpec("P")
|
24
|
+
R = TypeVar("R")
|
25
|
+
|
26
|
+
class Wrapper:
    """
    Minimal container that keeps a reference to the data it was built with.

    The workflow builder attaches module functions to this class so they can be called as methods that receive the stored data as their first argument.
    """

    def __init__(self, input_data: Any) -> None:
        """
        Stores the given data on the instance.

        :param input_data: Any data to be wrapped and stored internally.
        :type input_data: Any
        """
        # Kept under a private name; injected methods read it back from here.
        self._input_data: Any = input_data
|
35
|
+
|
36
|
+
class WorkflowBuilder:
    """
    Automates the execution of a sequence of methods as a workflow.

    This class dynamically imports and executes a chain of methods defined in a workflow, optionally validates the workflow against a template, and can wrap methods for object-oriented style usage.

    :param name: Name of the workflow instance.
    :type name: str
    :param module_name: Name of the module (inside ``dstk.modules``) containing the methods to be executed.
    :type module_name: str
    :param workflow: A workflow definition, a list of dicts mapping method names to kwargs.
    :type workflow: Workflow
    :param template: Optional workflow template for validation and typing rules. Defaults to None
    :type template: WorkflowTemplate or None
    :param wrapper: If True, calling the builder returns a Wrapper instance exposing the workflow's methods as object methods with internal data injection. Defaults to False.
    :type wrapper: bool

    Usage:

    .. code-block:: python

        CustomWorkflow = WorkflowBuilder(...)
        result = CustomWorkflow(input_data)
    """

    def __init__(self, name: str, module_name: str, workflow: Workflow, template: WorkflowTemplate | None = None, wrapper: bool = False) -> None:
        """
        Initializes WorkflowBuilder with given attributes.
        """

        self.name: str = name
        self.module_name: str = module_name
        self.methods: Workflow = workflow
        self.template: WorkflowTemplate | None = template
        # Data types seen so far during validation; StageWorkflowBuilder shares
        # this list between consecutive builders, so it is mutated in place.
        self.current_types: list[str] = []
        self.wrap: bool = wrapper

    def _resolve_method(self, method_name: str) -> Callable:
        """
        Looks up a callable by name in ``dstk.modules.<module_name>``.

        :param method_name: Name of the attribute to fetch from the module.
        :type method_name: str

        :return: The attribute found on the imported module.
        :rtype: Callable
        """
        module: ModuleType = importlib.import_module(f"dstk.modules.{self.module_name}")

        return getattr(module, method_name)

    def _run_methods(self, input_data: Any) -> StepGenerator:
        """
        Executes the sequence of methods in the workflow on the input data.

        For modules 'tokenizer' and 'text_processor' (except 'save_to_file'),
        the method is wrapped to accept sentences and tags.

        :param input_data: The input data to process through the workflow.
        :type input_data: Any

        :return: A generator yielding StepResult instances containing method names and their results.
        :rtype: StepGenerator
        """
        input_output: Any = input_data

        for method_dict in self.methods:
            # Each workflow entry is a single-item dict: {method_name: kwargs}.
            method_name, kwargs = next(iter(method_dict.items()))
            method: Callable = self._resolve_method(method_name)

            if self.module_name in ("tokenizer", "text_processor") and method_name not in ["save_to_file"]:
                input_output = accepts_sentences_and_collocates(accepts_tags(method))(input_output, **kwargs)
            else:
                input_output = method(input_output, **kwargs)

            # The output of each step is the input of the next one.
            yield StepResult(name=method_name, result=input_output)

    def _build_wrapper(self, input_data: Any) -> Wrapper:
        """
        Builds a Wrapper exposing every workflow method as an instance method.

        The methods are attached to a fresh subclass of Wrapper created for this call rather than to Wrapper itself, so wrappers produced by different builders do not overwrite each other's methods.

        :param input_data: Data stored in the wrapper and passed as first argument to every injected method.
        :type input_data: Any

        :return: An instance of a Wrapper subclass holding ``input_data``.
        :rtype: Wrapper
        """

        def inject_data(func: Callable[P, R]) -> Callable[Concatenate[Any, P], R]:
            @wraps(func)
            def wrapper(instance, *args: P.args, **kwargs: P.kwargs) -> R:
                return func(instance._input_data, *args, **kwargs)
            return wrapper

        namespace: dict[str, Any] = {}

        for method_dict in self.methods:
            method_name, kwargs = next(iter(method_dict.items()))

            if kwargs:
                warnings.warn("Because you set wrapper=True, the arguments you passed to the methods in the workflow will be ignored.")

            namespace[method_name] = inject_data(self._resolve_method(method_name))

        wrapper_class: type[Wrapper] = type(Wrapper.__name__, (Wrapper,), namespace)

        return wrapper_class(input_data)

    def __call__(self, input_data: Any, return_methods: list[str] | None = None, return_all: bool = False) -> StepGenerator | ResultGenerator | Wrapper | Any:
        """
        Executes the workflow on the given input data.

        Depending on parameters, can return results of specific methods, all method results as a generator, the final result, or a Wrapper instance.

        :param input_data: Data to be processed by the workflow.
        :type input_data: Any
        :param return_methods: If specified, only results of these methods are returned. Defaults to None
        :type return_methods: list[str] or None
        :param return_all: If True, returns a generator for all method results. Defaults to False.
        :type return_all: bool

        :return: Depending on parameters:
            * Wrapper instance if wrap=True,
            * Generator of selected/all method results,
            * Final processed result otherwise.
        :rtype: StepGenerator | ResultGenerator | Wrapper | Any

        :raises ValueError: If the workflow format is invalid.
        :raises RuntimeError: If a template is set and the workflow violates it.
        """

        if self.wrap:  # Maybe validate workflow too?
            return self._build_wrapper(input_data)

        if not is_workflow(self.methods):
            raise ValueError("The workflow provided does not follow the right format. Please enter a valid workflow")

        if self.template:
            # NOTE(review): _validate_workflow appends to self.current_types on
            # every call, so calling the same builder repeatedly accumulates
            # state -- confirm whether that is intended.
            self._validate_workflow(base_type=self.template["base_type"])

        if return_methods:
            return (result for name, result in self._run_methods(input_data) if name in return_methods)
        elif return_all:
            return self._run_methods(input_data)
        else:
            # Drain the generator; the last yielded value is the final result.
            result: Any = input_data
            for _, result in self._run_methods(input_data):
                pass
            return result

    def _validate_workflow(self, base_type: str) -> bool:
        """
        Validates the workflow against the given template and type rules.

        Ensures methods are used in valid steps, not repeated improperly,
        and conform to chaining and inclusion/exclusion rules.

        :param base_type: The starting type of the workflow.
        :type base_type: str

        :return: True if workflow passes validation, otherwise raises RuntimeError.
        :rtype: bool

        :raises ValueError: If no template is provided.
        :raises RuntimeError: If a method breaks a step, repetition or type rule.
        """
        # Maps a method name (or "*") to the reason it may no longer be used.
        excluded_methods: dict[str, str] = {}
        self.current_types.append(base_type)

        template: WorkflowTemplate | None = self.template

        if template is None:
            raise ValueError(f"A template was not provided for this module.")

        methods: list[str] = [list(method)[0] for method in self.methods]
        last_step: int = len(template["steps"]) - 1
        current_step: int = 0

        for current_method in methods:
            step_data: StepConfig = template["steps"][current_step]

            # Re-checked for every method because _trigger_type may have added
            # a type that forbids the whole module.
            for data_type in self.current_types:
                if data_type not in TypeRules or self.module_name not in TypeRules[data_type]:
                    continue
                if TypeRules[data_type][self.module_name]["exclude"] == "*":
                    raise RuntimeError(f"You cannot choose the methods from {self.module_name} because you are currently processing by {self.current_types}")

            if "*" in excluded_methods and not step_data["chaining"]:
                raise RuntimeError(f"You cannot use method {current_method} because {excluded_methods['*']}")

            if current_method in excluded_methods:
                raise RuntimeError(f"You cannot use method {current_method} because {excluded_methods[current_method]}")

            if "include" in step_data:
                included_methods: list[str] | str = step_data["include"]

                if included_methods != "*":
                    if current_method not in included_methods:
                        raise RuntimeError(f"The method on step {step_data['step_name']} must be {included_methods}. Instead, got method {current_method}")

                    # Methods reserved for this step cannot reappear later.
                    for included_method in included_methods:
                        excluded_methods[included_method] = f"it can ony be used on step {step_data['step_name']}"
                else:
                    excluded_methods["*"] = f"you can select only one method from this module."
            else:
                # Here "exclude" maps a method name to the step index it belongs to.
                step_excluded_methods: dict[str, int] = step_data["exclude"]
                if current_method in step_excluded_methods:
                    raise RuntimeError(f"You cannot use method {current_method} because it can only be used in step {template['steps'][step_excluded_methods[current_method]]['step_name']}")

            if not step_data["repeat"]:
                excluded_methods[current_method] = "this method cannot be used twice"

            self._trigger_type(current_method, excluded_methods)

            # Once the last step is reached, remaining methods are checked
            # against that same step.
            if current_step < last_step:
                current_step += 1

        return True

    def _trigger_type(self, method_name: str, excluded_methods: dict[str, str]) -> None:
        """
        Updates the current data types and excluded methods based on the triggered method.

        :param method_name: The method that triggers a type change.
        :type method_name: str
        :param excluded_methods: Dictionary tracking excluded methods and their reasons. Mutated in place.
        :type excluded_methods: dict[str, str]

        :raises ValueError: If no template is provided.
        """
        template: WorkflowTemplate | None = self.template

        if template is None:
            raise ValueError(f"A template was not provided. Please provide a valid template")

        triggers: dict[str, str] = template["triggers"]

        if method_name not in triggers or not self.module_name:
            return

        new_type: str = triggers[method_name]
        rules: RulesTemplate = TypeRules[new_type]

        # NOTE(review): pop() drops the last entry, which is the base type only
        # when it was the most recently added one -- confirm against callers.
        if template["base_type"] in self.current_types:
            self.current_types.pop()

        self.current_types.append(new_type)

        if self.module_name in rules:
            method_rules: list[str] | str = rules[self.module_name]["exclude"]

            for method in method_rules:
                excluded_methods[method] = f"you are currently processing by {self.current_types}"
|
265
|
+
|
266
|
+
|
267
|
+
class StageWorkflowBuilder:
    """
    Runs several per-module workflows one after another, organised in stages.

    Each entry of ``workflows`` is turned into a WorkflowBuilder, checked against the modules allowed for its stage and against its template, and then executed with the previous workflow's output as input.

    :param templates: A mapping of module names to their workflow templates.
    :type templates: StageTemplate
    :param stage_modules: A mapping of stage indices to allowed module names.
    :type stage_modules: StageModules
    :param name: Name of the stage workflow builder instance.
    :type name: str
    :param workflows: A mapping of module names to their workflows.
    :type workflows: StageWorkflow
    """

    def __init__(self, templates: StageTemplate, stage_modules: StageModules, name: str, workflows: StageWorkflow):
        """
        Stores the configuration; builders are created when the instance is called.
        """

        self.templates: StageTemplate = templates
        self.stage_modules: StageModules = stage_modules
        self.name: str = name
        self.stage_workflows: StageWorkflow = workflows
        # Rebuilt from scratch on every __call__.
        self.workflows: list[WorkflowBuilder] = []

    def _run_workflow(self, input_data: Any) -> StepGenerator:
        """
        Executes all prepared workflows in sequence on the input data.

        :param input_data: The input data to process through the workflow.
        :type input_data: Any

        :return: A generator yielding StepResult instances containing workflow names and their results.
        :rtype: StepGenerator
        """
        carried: Any = input_data

        for builder in self.workflows:
            # Output of one module workflow feeds the next one.
            carried = builder(input_data=carried)
            yield StepResult(name=builder.name, result=carried)

    def __call__(self, input_data: Any, return_modules: list[str] | None = None, return_all: bool = False) -> ResultGenerator | StepGenerator | Any:
        """
        Runs the staged workflows on the input data.

        Checks stage/module compatibility, validates workflows against templates, and returns results based on parameters.

        :param input_data: Input data to process.
        :type input_data: Any
        :param return_modules: If provided, yields results only for these modules. Defaults to None
        :type return_modules: list[str] or None
        :param return_all: If True, yields results for all modules. Defaults to False.
        :type return_all: bool

        :return: Final result, or a generator of step/module results.
        :rtype: ResultGenerator | StepGenerator | Any

        :raises RuntimeError: If a module is used in an incorrect stage.
        :raises ValueError: If any workflow is invalid or template is missing.
        """

        self.workflows = []

        max_stage: int = max(self.stage_modules)
        current_stage: int = 0

        for module, workflow in self.stage_workflows.items():
            allowed_modules: set[str] = self.stage_modules[current_stage]

            if module not in allowed_modules:
                raise RuntimeError(f"The module on on stage {current_stage} must be one of the following {allowed_modules}")

            # Modules beyond the last declared stage are checked against that last stage.
            if current_stage < max_stage:
                current_stage += 1

            if not is_workflow(workflow):
                raise ValueError(f"Please enter a valid workflow for module {module}")

            builder: WorkflowBuilder = WorkflowBuilder(
                name=f"{module}",
                module_name=module,
                workflow=workflow,
                template=self.templates[module],
            )

            # Share the previous builder's type list (same object) so type
            # information carries over from one module to the next.
            if self.workflows:
                builder.current_types = self.workflows[-1].current_types

            module_template: WorkflowTemplate | None = builder.template

            if module_template is None:
                raise ValueError(f"A template for module {module} not provided")

            builder._validate_workflow(base_type=module_template["base_type"])
            self.workflows.append(builder)

        if return_modules:
            return (result for name, result in self._run_workflow(input_data) if name in return_modules)

        if return_all:
            return self._run_workflow(input_data)

        # Drain the generator; the last yielded value is the final result.
        result = input_data
        for _, result in self._run_workflow(input_data):
            pass

        return result
|
381
|
+
|
382
|
+
|
383
|
+
|