dstklib 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. dstk/__init__.py +10 -12
  2. dstk/adaptors/__init__.py +2 -0
  3. dstk/adaptors/adaptors.py +91 -0
  4. dstk/adaptors/typeguards.py +141 -0
  5. dstk/hooks/__init__.py +2 -0
  6. dstk/hooks/hook_tools.py +89 -0
  7. dstk/hooks/type_conversion.py +40 -0
  8. dstk/lib_types/__init__.py +2 -3
  9. dstk/lib_types/dstk_types.py +188 -16
  10. dstk/lib_types/plotly_types.py +1 -0
  11. dstk/method_index.py +32 -0
  12. dstk/models/__init__.py +2 -0
  13. dstk/models/model_tools.py +83 -0
  14. dstk/models/models.py +191 -0
  15. dstk/modules/__init__.py +10 -0
  16. dstk/modules/count_models.py +91 -0
  17. dstk/modules/data_visualization/__init__.py +2 -0
  18. dstk/modules/data_visualization/clustering.py +129 -0
  19. dstk/modules/data_visualization/embeddings.py +101 -0
  20. dstk/modules/geometric_distance.py +114 -0
  21. dstk/modules/ngrams.py +156 -0
  22. dstk/modules/predict_models.py +109 -0
  23. dstk/modules/text_matrix_builder.py +55 -0
  24. dstk/modules/text_processor.py +100 -0
  25. dstk/modules/tokenizer.py +139 -0
  26. dstk/modules/weight_matrix.py +65 -0
  27. dstk/templates/__init__.py +2 -0
  28. dstk/templates/rules.py +59 -0
  29. dstk/templates/templates.py +231 -0
  30. dstk/workflows/__init__.py +2 -0
  31. dstk/workflows/stage_workflows.py +55 -0
  32. dstk/workflows/workflow_tools.py +383 -0
  33. dstklib-2.0.0.dist-info/METADATA +377 -0
  34. dstklib-2.0.0.dist-info/RECORD +43 -0
  35. dstk/collocations.py +0 -121
  36. dstk/count_models.py +0 -112
  37. dstk/geometric_distance.py +0 -107
  38. dstk/lib_types/matplotlib_types.py +0 -4
  39. dstk/lib_types/nltk_types.py +0 -1
  40. dstk/matrix_base.py +0 -113
  41. dstk/pipeline_tools.py +0 -27
  42. dstk/pipelines.py +0 -114
  43. dstk/plot_embeddings.py +0 -240
  44. dstk/predict_models.py +0 -189
  45. dstk/text_matrix_builder.py +0 -87
  46. dstk/text_processor.py +0 -450
  47. dstk/weight_matrix.py +0 -71
  48. dstk/workflow_tools.py +0 -257
  49. dstklib-1.0.1.dist-info/METADATA +0 -360
  50. dstklib-1.0.1.dist-info/RECORD +0 -28
  51. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
  52. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
  53. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,383 @@
1
+ """
2
+ This module provides classes for defining, validating, and executing complex workflows composed of multiple processing steps and stages. It supports dynamic method invocation from specified modules, workflow validation against templates with type and step rules, and optional method wrapping for object-oriented usage.
3
+
4
+ Key components:
5
+
6
+ * Wrapper: Simple container for input data, enabling method injection.
7
+ * WorkflowBuilder: Automates sequential execution of methods in a single workflow, including validation and optional wrapping.
8
+ * StageWorkflowBuilder: Manages multiple workflows organized in stages and modules, enforcing stage/module constraints and chaining workflows.
9
+
10
+ This module is designed to facilitate building flexible, validated processing workflows with dynamic and modular behavior.
11
+ """
12
+
13
+ import importlib
14
+ from ..templates.rules import TypeRules
15
+ from ..adaptors import accepts_sentences_and_collocates, accepts_tags, is_workflow
16
+ from types import ModuleType
17
+ import warnings
18
+ from functools import wraps
19
+
20
+ from typing import Any, Callable, TypeVar, ParamSpec, Concatenate
21
+ from ..lib_types import Workflow, WorkflowTemplate, StageWorkflow, StepResult, StepConfig, RulesTemplate, StageTemplate, StageModules, StepGenerator, ResultGenerator
22
+
23
+ P = ParamSpec("P")
24
+ R = TypeVar("R")
25
+
26
class Wrapper:
    """
    A simple wrapper class that stores input data.

    The data is kept on the private attribute ``_input_data``; methods that
    WorkflowBuilder exposes in wrapper mode read it from there.

    :param input_data: Any data to be wrapped and stored internally.
    :type input_data: Any
    """

    def __init__(self, input_data: Any):
        """
        Stores ``input_data`` on the instance without copying or validating it.
        """
        self._input_data: Any = input_data
35
+
36
+ class WorkflowBuilder:
37
+ """
38
+ Automates the execution of a sequence of methods as a workflow.
39
+
40
+ This class dynamically imports and executes a chain of methods defined in a workflow, optionally validates the workflow against a template, and can wrap methods for object-oriented style usage.
41
+
42
+ :param name: Name of the workflow instance.
43
+ :type name: str
44
+ :param module_name: Name of the module containing the methods to be executed.
45
+ :type module_name: str
46
+ :param workflow: A workflow definition, a list of dicts mapping method names to kwargs.
47
+ :type workflow: Workflow
48
+ :param template: Optional workflow template for validation and typing rules. Defaults to None
49
+ :type template: WorkflowTemplate or None
50
+ :param wrapper: If True, creates a Wrapper instance allowing method calls as object methods with internal data injection. Defaults to False.
51
+
52
+ Usage:
53
+
54
+ .. code-block:: python
55
+
56
+ CustomWorkflow = WorkflowBuilder(...)
57
+ result = CustomWorkflow(input_data)
58
+ """
59
+
60
+ def __init__(self, name: str, module_name: str, workflow: Workflow, template: WorkflowTemplate | None = None, wrapper: bool = False) -> None:
61
+ """
62
+ Initializes WorkflowBuilder with given attributes.
63
+ """
64
+
65
+ self.name: str = name
66
+ self.module_name: str = module_name
67
+ self.methods: Workflow = workflow
68
+ self.template: WorkflowTemplate | None = template
69
+ self.current_types: list[str] = []
70
+ self.wrap: bool = wrapper
71
+
72
+ def _run_methods(self, input_data: Any) -> StepGenerator:
73
+ """
74
+ Executes the sequence of methods in the workflow on the input data.
75
+
76
+ For modules 'tokenizer' and 'text_processor' (except 'save_to_file'),
77
+ the method is wrapped to accept sentences and tags.
78
+
79
+ :param input_data: The input data to process through the workflow.
80
+ :type input_data: Any
81
+
82
+ :return: A generator yielding StepResult instances containing method names and their results.
83
+ :rtype: StepGenerator
84
+ """
85
+ input_output: Any = input_data
86
+
87
+ for method_dict in self.methods:
88
+ method_name, kwargs = next(iter(method_dict.items()))
89
+
90
+ module: ModuleType = importlib.import_module(f"dstk.modules.{self.module_name}")
91
+
92
+ method: Callable = getattr(module, method_name)
93
+
94
+ if self.module_name in ("tokenizer", "text_processor") and method_name not in ["save_to_file"]:
95
+ input_output = accepts_sentences_and_collocates(accepts_tags(method))(input_output, **kwargs)
96
+ else:
97
+ input_output = method(input_output, **kwargs)
98
+
99
+ yield StepResult(name=method_name, result=input_output)
100
+
101
+ def __call__(self, input_data: Any, return_methods: list[str] | None = None, return_all: bool = False) -> StepGenerator | ResultGenerator | Wrapper | Any:
102
+ """
103
+ Executes the workflow on the given input data.
104
+
105
+ Depending on parameters, can return results of specific methods, all method results as a generator, the final result, or a Wrapper instance.
106
+
107
+ :param input_data: Data to be processed by the workflow.
108
+ :type input_data: Any
109
+ :param return_methods: If specified, only results of these methods are returned. Defaults to None
110
+ :type return_methods: list[str] or None
111
+ :param return_all: If True, returns a generator for all method results.Defaults to None
112
+
113
+ :return: Depending on parameters:
114
+ * Wrapper instance if wrap=True,
115
+ * Generator of selected/all method results,
116
+ * Final processed result otherwise.
117
+ :rtype: StepGenerator | ResultGenerator | Wrapper | Any
118
+
119
+ :raises ValueError: If the workflow format is invalid.
120
+ """
121
+
122
+ if self.wrap: # Maybe validate workflow too?
123
+ for method_dict in self.methods:
124
+ method_name, kwargs = next(iter(method_dict.items()))
125
+
126
+ if kwargs:
127
+ warnings.warn("Because you set wrapper=True, the arguments you passed to the methods in the workflow will be ignored.")
128
+
129
+ module: ModuleType = importlib.import_module(f"dstk.modules.{self.module_name}")
130
+
131
+ method: Callable = getattr(module, method_name)
132
+
133
+ def inject_data(func: Callable[P, R]) -> Callable[Concatenate[Any, P], R]:
134
+ @wraps(func)
135
+ def wrapper(self, *args: P.args, **kwargs: P.kwargs) -> R:
136
+ return func(self._input_data, *args, **kwargs)
137
+ return wrapper
138
+
139
+ setattr(Wrapper, method_name, inject_data(method))
140
+
141
+ return Wrapper(input_data)
142
+
143
+ if not is_workflow(self.methods):
144
+ raise ValueError("The workflow provided does not follow the right format. Please enter a valid workflow")
145
+
146
+ if self.template:
147
+ is_valid: bool = self._validate_workflow(base_type=self.template["base_type"])
148
+
149
+ if return_methods:
150
+ return (result for name, result in self._run_methods(input_data) if name in return_methods)
151
+ elif return_all:
152
+ return self._run_methods(input_data)
153
+ else:
154
+ result: Any = input_data
155
+ for _, result in self._run_methods(input_data):
156
+ pass
157
+ return result
158
+
159
+
160
+ def _validate_workflow(self, base_type: str) -> bool:
161
+ """
162
+ Validates the workflow against the given template and type rules.
163
+
164
+ Ensures methods are used in valid steps, not repeated improperly,
165
+ and conform to chaining and inclusion/exclusion rules.
166
+
167
+ :param base_type: The starting type of the workflow.
168
+ :type base_type: str
169
+
170
+ :return: True if workflow passes validation, otherwise raises RuntimeError.
171
+ :rtype: bool
172
+
173
+ :raises ValueError: If no template is provided.
174
+ """
175
+ current_line: int = 0
176
+ current_step: int = 0
177
+ excluded_methods: dict[str, str] = {}
178
+ self.current_types.extend([base_type])
179
+
180
+ template: WorkflowTemplate | None = self.template
181
+
182
+ if template is None:
183
+ raise ValueError(f"A template was not provided for this module.")
184
+
185
+ methods: list[str] = [list(method.keys())[0] for method in self.methods]
186
+ steps: list[int] = list(template["steps"].keys())
187
+
188
+ while current_line < len(methods):
189
+ current_method: str = methods[current_line]
190
+ step_data: StepConfig = template["steps"][current_step]
191
+
192
+ for data_type in self.current_types:
193
+ if data_type in TypeRules:
194
+ if self.module_name in TypeRules[data_type]:
195
+ if TypeRules[data_type][self.module_name]["exclude"] == "*":
196
+ raise RuntimeError(f"You cannot choose the methods from {self.module_name} because you are currently processing by {self.current_types}")
197
+
198
+ if "*" in excluded_methods and not step_data["chaining"]:
199
+ raise RuntimeError(f"You cannot use method {current_method} because {excluded_methods['*']}")
200
+
201
+ if current_method in excluded_methods:
202
+ raise RuntimeError(f"You cannot use method {current_method} because {excluded_methods[current_method]}")
203
+
204
+ if "include" in step_data:
205
+ included_methods: list[str] | str = step_data["include"]
206
+
207
+ if included_methods != "*":
208
+ if current_method not in included_methods:
209
+ raise RuntimeError(f"The method on step {template['steps'][current_step]['step_name']} must be {included_methods}. Instead, got method {current_method}")
210
+
211
+ for included_method in included_methods:
212
+ excluded_methods[included_method] = f"it can ony be used on step {template['steps'][current_step]['step_name']}"
213
+ else:
214
+ excluded_methods["*"] = f"you can select only one method from this module."
215
+ else:
216
+ step_excluded_methods: dict[str, int] = step_data["exclude"]
217
+ if current_method in step_excluded_methods:
218
+ raise RuntimeError(f"You cannot use method {current_method} because it can only be used in step {template['steps'][step_excluded_methods[current_method]]['step_name']}")
219
+
220
+ if not step_data["repeat"]:
221
+ excluded_methods[current_method] = "this method cannot be used twice"
222
+
223
+ self._trigger_type(current_method, excluded_methods)
224
+
225
+ current_line += 1
226
+
227
+ if current_step < len(steps) - 1:
228
+ current_step += 1
229
+ else:
230
+ pass
231
+
232
+ return True
233
+
234
+ def _trigger_type(self, method_name: str, excluded_methods: dict[str, str]) -> None:
235
+ """
236
+ Updates the current data types and excluded methods based on the triggered method.
237
+
238
+ :param method_name: The method that triggers a type change.
239
+ :type method_name: str
240
+ :param excluded_methods: Dictionary tracking excluded methods and their reasons.
241
+ :type excluded_methods: dict[str, str]
242
+
243
+ :raises ValueError: If no template is provided.
244
+ """
245
+ template: WorkflowTemplate | None = self.template
246
+
247
+ if template is None:
248
+ raise ValueError(f"A template was not provided. Please provide a valid template")
249
+
250
+ triggers: dict[str, str] = template["triggers"]
251
+
252
+ if method_name in triggers and self.module_name:
253
+ rules: RulesTemplate = TypeRules[triggers[method_name]]
254
+
255
+ if template["base_type"] in self.current_types:
256
+ self.current_types.pop()
257
+
258
+ self.current_types.append(triggers[method_name])
259
+
260
+ if self.module_name in rules:
261
+ method_rules: list[str] | str = rules[self.module_name]["exclude"]
262
+
263
+ for method in method_rules:
264
+ excluded_methods[method] = f"you are currently processing by {self.current_types}"
265
+
266
+
267
class StageWorkflowBuilder:
    """
    Manages and runs workflows composed of multiple stages and modules.

    Allows sequential execution of workflows associated with various modules/stages, validating and chaining them according to provided templates and configurations.

    :param templates: A mapping of module names to their workflow templates.
    :type templates: StageTemplate
    :param stage_modules: A mapping of stage indices to allowed module names.
    :type stage_modules: StageModules
    :param name: Name of the stage workflow builder instance.
    :type name: str
    :param workflows: A mapping of module names to their workflows.
    :type workflows: StageWorkflow
    """

    def __init__(self, templates: StageTemplate, stage_modules: StageModules, name: str, workflows: StageWorkflow):
        """
        Initializes StageWorkflowBuilder with given attributes.
        """
        self.name: str = name
        self.stage_workflows: StageWorkflow = workflows
        self.templates: StageTemplate = templates
        self.stage_modules: StageModules = stage_modules
        # Rebuilt from scratch on every __call__.
        self.workflows: list[WorkflowBuilder] = []

    def _run_workflow(self, input_data: Any) -> StepGenerator:
        """
        Executes all workflows in sequence on the input data.

        Each workflow receives the previous workflow's result as its input.

        :param input_data: The input data to process through the workflow.
        :type input_data: Any

        :return: A generator yielding StepResult instances containing workflow names and their results.
        :rtype: StepGenerator
        """
        result: Any = input_data

        for workflow in self.workflows:
            # NOTE(review): calling the builder validates it again, on the
            # current_types list shared with the other stage builders.
            result = workflow(input_data=result)

            yield StepResult(name=workflow.name, result=result)

    def __call__(self, input_data: Any, return_modules: list[str] | None = None, return_all: bool = False) -> ResultGenerator | StepGenerator | Any:
        """
        Runs the staged workflows on the input data.

        Checks stage/module compatibility, validates workflows against templates, and returns results based on parameters.

        :param input_data: Input data to process.
        :type input_data: Any
        :param return_modules: If provided, yields results only for these modules. Defaults to None
        :type return_modules: list[str] or None
        :param return_all: If True, yields results for all modules. Defaults to False.
        :type return_all: bool

        :return: Final result, or a generator of step/module results.
        :rtype: ResultGenerator | StepGenerator | Any

        :raises RuntimeError: If a module is used in an incorrect stage.
        :raises ValueError: If any workflow is invalid or template is missing.
        """
        self.workflows = []

        max_stage: int = max(self.stage_modules)
        current_stage: int = 0

        for module in self.stage_workflows:
            allowed_modules: set[str] = self.stage_modules[current_stage]

            if module not in allowed_modules:
                raise RuntimeError(f"The module on on stage {current_stage} must be one of the following {allowed_modules}")

            # Modules beyond the last declared stage are checked against it.
            if current_stage < max_stage:
                current_stage += 1

            # Look the template up first so that a module with no template
            # raises the documented ValueError rather than a KeyError.
            try:
                template: WorkflowTemplate | None = self.templates[module]
            except KeyError:
                template = None

            if template is None:
                raise ValueError(f"A template for module {module} not provided")

            if not is_workflow(self.stage_workflows[module]):
                raise ValueError(f"Please enter a valid workflow for module {module}")

            module_workflow: WorkflowBuilder = WorkflowBuilder(
                name=f"{module}",
                template=template,
                module_name=module,
                workflow=self.stage_workflows[module],
            )

            # Share (not copy) the previous builder's type list so types
            # triggered in earlier stages are visible to this validation.
            if self.workflows:
                module_workflow.current_types = self.workflows[-1].current_types

            module_workflow._validate_workflow(base_type=template["base_type"])

            self.workflows.append(module_workflow)

        steps: StepGenerator = self._run_workflow(input_data)

        if return_modules:
            return (result for name, result in steps if name in return_modules)

        if return_all:
            return steps

        # Drain the generator and keep the last result; with no workflows the
        # input is returned unchanged.
        result: Any = input_data
        for _, result in steps:
            pass
        return result
381
+
382
+
383
+