dstklib 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. dstk/__init__.py +10 -12
  2. dstk/adaptors/__init__.py +2 -0
  3. dstk/adaptors/adaptors.py +91 -0
  4. dstk/adaptors/typeguards.py +141 -0
  5. dstk/hooks/__init__.py +2 -0
  6. dstk/hooks/hook_tools.py +89 -0
  7. dstk/hooks/type_conversion.py +40 -0
  8. dstk/lib_types/__init__.py +2 -3
  9. dstk/lib_types/dstk_types.py +188 -16
  10. dstk/lib_types/plotly_types.py +1 -0
  11. dstk/method_index.py +32 -0
  12. dstk/models/__init__.py +2 -0
  13. dstk/models/model_tools.py +83 -0
  14. dstk/models/models.py +191 -0
  15. dstk/modules/__init__.py +10 -0
  16. dstk/modules/count_models.py +91 -0
  17. dstk/modules/data_visualization/__init__.py +2 -0
  18. dstk/modules/data_visualization/clustering.py +129 -0
  19. dstk/modules/data_visualization/embeddings.py +101 -0
  20. dstk/modules/geometric_distance.py +114 -0
  21. dstk/modules/ngrams.py +156 -0
  22. dstk/modules/predict_models.py +109 -0
  23. dstk/modules/text_matrix_builder.py +55 -0
  24. dstk/modules/text_processor.py +100 -0
  25. dstk/modules/tokenizer.py +139 -0
  26. dstk/modules/weight_matrix.py +65 -0
  27. dstk/templates/__init__.py +2 -0
  28. dstk/templates/rules.py +59 -0
  29. dstk/templates/templates.py +231 -0
  30. dstk/workflows/__init__.py +2 -0
  31. dstk/workflows/stage_workflows.py +55 -0
  32. dstk/workflows/workflow_tools.py +383 -0
  33. dstklib-2.0.0.dist-info/METADATA +377 -0
  34. dstklib-2.0.0.dist-info/RECORD +43 -0
  35. dstk/collocations.py +0 -121
  36. dstk/count_models.py +0 -112
  37. dstk/geometric_distance.py +0 -107
  38. dstk/lib_types/matplotlib_types.py +0 -4
  39. dstk/lib_types/nltk_types.py +0 -1
  40. dstk/matrix_base.py +0 -113
  41. dstk/pipeline_tools.py +0 -27
  42. dstk/pipelines.py +0 -114
  43. dstk/plot_embeddings.py +0 -240
  44. dstk/predict_models.py +0 -189
  45. dstk/text_matrix_builder.py +0 -87
  46. dstk/text_processor.py +0 -450
  47. dstk/weight_matrix.py +0 -71
  48. dstk/workflow_tools.py +0 -257
  49. dstklib-1.0.1.dist-info/METADATA +0 -360
  50. dstklib-1.0.1.dist-info/RECORD +0 -28
  51. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
  52. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
  53. {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,65 @@
1
+ """
2
+ This module provides functions to apply weighting schemes to co-occurrence matrices commonly used in natural language processing and text mining.
3
+
4
+ Available weighting methods include:
5
+
6
+ * Pointwise Mutual Information (PMI) and Positive PMI (PPMI), which measure the association strength between co-occurring terms by comparing observed co-occurrence frequencies to expected frequencies under independence.
7
+ * Term Frequency-Inverse Document Frequency (Tf-idf), which reweights term importance based on frequency patterns, leveraging sklearn's TfidfTransformer.
8
+
9
+ These weighting techniques help enhance the semantic relevance of co-occurrence matrices, improving downstream tasks such as word embedding, clustering, and semantic similarity analysis.
10
+
11
+ All functions return weighted co-occurrence matrices as Pandas DataFrames for convenient further analysis.
12
+ """
13
+
14
+ import pandas as pd
15
+ import numpy as np
16
+ from sklearn.feature_extraction.text import TfidfTransformer
17
+
18
+ from ..lib_types import DataFrame, ndarray, Series, csr_matrix
19
+
20
+
21
def pmi(co_matrix: DataFrame, positive: bool = False) -> DataFrame:
    """
    Weights a Co-occurrence matrix by PMI or PPMI.

    Each cell becomes ``log(observed / expected)``, where ``expected`` is the outer product of the row totals and column totals divided by the grand total. Cells for which that value is not finite (a zero count, an all-zero row or column, or an all-zero matrix) are set to 0.0.

    :param co_matrix: A Co-occurrence matrix to be weighted. It is not modified.
    :type co_matrix: DataFrame
    :param positive: If True, weights the Co-occurrence matrix by PPMI (negative values are replaced by 0.0). If False, weights it by PMI. Defaults to False.
    :type positive: bool

    :returns: A Co-occurrence matrix weighted by PMI or PPMI, with the same index and columns as ``co_matrix``.
    :rtype: DataFrame
    """

    col_totals: Series = co_matrix.sum(axis=0)
    row_totals: Series = co_matrix.sum(axis=1)
    total: float = col_totals.sum()

    # Zero counts give log(0) = -inf and zero marginals give 0/0 = NaN.
    # Both are expected here, so silence the corresponding warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        expected: ndarray = np.outer(row_totals, col_totals) / total
        weighted: DataFrame = np.log(co_matrix / expected)

    # Treat every undefined association (-inf, +inf or NaN) as "no association".
    weighted = weighted.where(np.isfinite(weighted), 0.0)

    if positive:
        weighted = weighted.clip(lower=0.0)

    return weighted
49
+
50
def tf_idf(co_matrix: DataFrame, **kwargs) -> DataFrame:
    """
    Weights a Co-occurrence matrix by Tf-idf.

    A fresh sklearn TfidfTransformer is fitted on ``co_matrix`` and applied to it; the result is converted to a dense array and wrapped in a DataFrame that reuses the index and columns of the input.

    :param co_matrix: A Co-occurrence matrix to be weighted.
    :type co_matrix: DataFrame
    :param kwargs: Additional keyword arguments to pass to sklearn's TfidfTransformer. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

    :returns: A Co-occurrence matrix weighted by Tf-idf.
    :rtype: DataFrame
    """

    weighted: csr_matrix = TfidfTransformer(**kwargs).fit_transform(co_matrix)
    dense_values: ndarray = weighted.toarray()

    return pd.DataFrame(
        dense_values,
        index=co_matrix.index,
        columns=co_matrix.columns,
    )
@@ -0,0 +1,2 @@
1
+ from .templates import *
2
+ from .rules import *
@@ -0,0 +1,59 @@
1
+ """
2
+ Defines type-based exclusion rules that constrain which methods can be applied to data at different stages in a workflow.
3
+
4
+ Each rule maps a data type (e.g., `POSTaggedWordList`, `Sentences`, `str`, `Neighbors`) to a set of module-specific restrictions. These rules help ensure that operations are semantically valid and compatible with the current data representation, enabling type-aware validation and error handling during workflow execution.
5
+
6
+ Structure:
7
+
8
+ Each rule is a `RulesTemplate` (dict) where:
9
+
10
+ * Keys are module names (e.g., "tokenizer", "text_processor").
11
+ * Values define methods to exclude (either a list of method names or "*" for all).
12
+
13
+ The `TypeRules` dictionary aggregates all individual rules and serves as a centralized configuration for type-based behavior enforcement.
14
+
15
+ Use case:
16
+ These rules are primarily used in the validation step of workflow builders to prevent method misuse based on data type.
17
+ """
18
+
19
+ from ..lib_types import RulesTemplate
20
+
21
# Exclusion rules registered under the "POSTaggedWordList" key of TypeRules:
#   * tokenizer: "pos_tagger" is excluded.
#   * text_processor: "join" is excluded.
#   * ngrams: every method is excluded ("*").
POSTaggedWordListRules: RulesTemplate = {
    "tokenizer": {
        "exclude": ["pos_tagger"]
    },
    "text_processor": {
        "exclude": ["join"]
    },
    "ngrams": {
        "exclude": "*"
    }
}
32
+
33
# Exclusion rules registered under the "Sentences" key of TypeRules:
#   * text_processor: "get_vocabulary" and "save_to_file" are excluded.
#   * ngrams: every method is excluded ("*").
SentencesRules: RulesTemplate = {
    "text_processor": {
        "exclude": ["get_vocabulary", "save_to_file"]
    },
    "ngrams": {
        "exclude": "*"
    }
}
41
+
42
# Exclusion rules registered under the "str" key of TypeRules:
#   * text_processor: "to_lower", "get_vocabulary", "join" and "save_to_file" are excluded.
StringRules: RulesTemplate = {
    "text_processor": {
        "exclude": ["to_lower", "get_vocabulary", "join", "save_to_file"]
    }
}
47
+
48
# Exclusion rules registered under the "Neighbors" key of TypeRules:
#   * geometric_distance: every method is excluded ("*").
NeighborsRules: RulesTemplate = {
    "geometric_distance": {
        "exclude": "*"
    }
}
53
+
54
# Central lookup table: maps a data type name (as a string) to the rule set that applies to it.
# NOTE(review): how these keys are matched against runtime data is not visible here — confirm in the workflow builders.
TypeRules: dict[str, RulesTemplate] = {
    "POSTaggedWordList": POSTaggedWordListRules,
    "Sentences": SentencesRules,
    "str": StringRules,
    "Neighbors": NeighborsRules
}
@@ -0,0 +1,231 @@
1
+ """
2
+ Defines reusable templates that specify the structure and constraints of workflows and step-based pipelines.
3
+
4
+ Each template outlines the allowed sequence of method steps for a given module, enforcing constraints such as:
5
+
6
+ * Which methods are permitted or excluded at each step (`include` / `exclude`)
7
+ * Whether methods can be used more than once (`repeat`)
8
+ * Whether more than one method can be selected on each step (`chaining`)
9
+ * How types are transformed via `triggers`
10
+
11
+ Key Components:
12
+
13
+ * **Workflow Templates** (`WorkflowTemplate`): Describe valid method sequences for individual modules (e.g., tokenization, text processing, dimensionality reduction).
14
+ * **Stage Templates** (`StageTemplate`): Group related modules into stages to define multi-module workflows.
15
+ * **Stage Modules** (`StageModules`): Define allowed module names for each stage in a stage-based workflow.
16
+
17
+ These templates are used by `WorkflowBuilder` and `StageWorkflowBuilder` to validate workflows and enforce correct sequencing of operations.
18
+
19
+ Examples of Defined Templates:
20
+
21
+ * `TokenizerTemplate`: Defines the tokenization workflow including model selection, unit selection (sentences/tokens), and token processing.
22
+ * `TextProcessorTemplate`: Defines generic text processing steps like lowercasing, joining, etc.
23
+ * `TextMatrixBuilderTemplate`: Specifies steps to create a document-term and co-occurrence matrix.
24
+ * `PlotEmbeddingsTemplate`: Governs how word embeddings are plotted after clustering.
25
+
26
+ The templates provide a flexible and declarative way to define what each step in a workflow is allowed to do based on processing intent and data type.
27
+ """
28
+
29
+ from ..lib_types import WorkflowTemplate, StageTemplate, StageModules
30
+
31
# Tokenizer workflow, four ordered steps:
#   0 "select_model":           only "apply_model".
#   1 "select_processing_unit": "get_sentences" or "get_tokens".
#   2 "tokenization":           "remove_stop_words" or "alphanumeric_raw_tokenizer".
#   3 "token_processing":       any method ("*"), with repeat and chaining enabled.
# Triggers (type changes): "pos_tagger" -> "POSTaggedWordList", "get_sentences" -> "Sentences".
TokenizerTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": ["apply_model"],
            "repeat": False,
            "chaining": False,
            "step_name": "select_model"
        },
        1: {
            "include": ["get_sentences", "get_tokens"],
            "repeat": False,
            "chaining": False,
            "step_name": "select_processing_unit"
        },
        2: {
            "include": ["remove_stop_words", "alphanumeric_raw_tokenizer"],
            "repeat": False,
            "chaining": False,
            "step_name": "tokenization"
        },
        3: {
            "include": "*",
            "repeat": True,
            "chaining": True,
            "step_name": "token_processing"
        }
    },
    "base_type": "Words",  # NOTE(review): original author questions whether base_type is necessary here
    "triggers": {
        "pos_tagger": "POSTaggedWordList",
        "get_sentences": "Sentences"
    }
}
64
+
65
# Text processor workflow: a single "text_processing" step that allows any method ("*"),
# with chaining enabled and repeat disabled. Base type is "Words"; no triggers.
TextProcessorTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": True,
            "step_name": "text_processing"
        }
    },
    "base_type": "Words",
    "triggers": {}
}
77
+
78
# Ngrams workflow, two ordered steps:
#   0 "find_ngrams":      defined by exclusion rather than inclusion; "count_collocates" is excluded.
#   1 "count_collocates": only "count_collocates".
# NOTE(review): the meaning of the value 1 in {"count_collocates": 1} is not visible here —
# presumably the step index where the method is allowed; confirm against the workflow builder.
NgramsTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "exclude": {"count_collocates": 1},
            "repeat": False,
            "chaining": False,
            "step_name": "find_ngrams"
        },
        1: {
            "include": ["count_collocates"],
            "repeat": False,
            "chaining": False,
            "step_name": "count_collocates"
        }
    },
    "base_type": "Words",
    "triggers": {}
}
96
+
97
# Text matrix builder workflow, two ordered steps:
#   0 "document_term_matrix": only "create_dtm" (chaining enabled).
#   1 "co_occurrence_matrix": only "create_co_occurrence_matrix".
# Base type is "DataFrame"; no triggers.
TextMatrixBuilderTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": ["create_dtm"],
            "repeat": False,
            "chaining": True,
            "step_name": "document_term_matrix"
        },
        1: {
            "include": ["create_co_occurrence_matrix"],
            "repeat": False,
            "chaining": False,
            "step_name": "co_occurrence_matrix"
        }
    },
    "base_type": "DataFrame",
    "triggers": {}
}
115
+
116
# Weight matrix workflow: a single "weight_matrix" step that allows any method ("*"),
# without repeat or chaining. Base type is "DataFrame"; no triggers.
WeightMatrixTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "weight_matrix"
        }
    },
    "base_type": "DataFrame",
    "triggers": {}
}
128
+
129
# Count models workflow, two ordered steps:
#   0 "scale_matrix":             only "scale_matrix".
#   1 "dimensionality_reduction": any method ("*").
# Base type is "DataFrame"; no triggers.
CountModelsTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": ["scale_matrix"],
            "repeat": False,
            "chaining": False,
            "step_name": "scale_matrix"
        },
        1: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "dimensionality_reduction"
        }
    },
    "base_type": "DataFrame",
    "triggers": {}
}
147
+
148
# Geometric distance workflow: a single "geometric_distance" step that allows any method ("*").
# Base type is "float"; using "nearest_neighbors" triggers the "Neighbors" type.
GeometricDistanceTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "geometric_distance"
        }
    },
    "base_type": "float",
    "triggers": {
        "nearest_neighbors": "Neighbors"
    }
}
162
+
163
# Predict models workflow, two ordered steps:
#   0 "select_model": defined by exclusion; "save_model" is excluded.
#   1 "save_model":   only "save_model".
# Base type is "NeuralModels"; using "save_model" triggers the "str" type.
# NOTE(review): the meaning of the value 1 in {"save_model": 1} is not visible here —
# presumably the step index where the method is allowed; confirm against the workflow builder.
PredictModelsTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "exclude": {"save_model": 1},
            "repeat": False,
            "chaining": False,
            "step_name": "select_model"
        },
        1: {
            "include": ["save_model"],
            "repeat": False,
            "chaining": False,
            "step_name": "save_model"
        }
    },
    "base_type": "NeuralModels",
    "triggers": {
        "save_model": "str"
    }
}
183
+
184
# Clustering workflow: a single "clustering" step that allows any method ("*"),
# without repeat or chaining. Base type is "DataFrame"; no triggers.
ClusteringTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "clustering"
        }
    },
    "base_type": "DataFrame",
    "triggers": {}
}
196
+
197
# Embedding plot workflow: a single step that allows any method ("*"),
# without repeat or chaining. Base type is "Figure"; no triggers.
# NOTE(review): the step name is spelled "embedings_plot" (single "d"); it is a runtime
# string that other code may look up, so it is intentionally left as-is here.
PlotEmbeddingsTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "embedings_plot"
        }
    },
    "base_type": "Figure",
    "triggers": {}
}
209
+
210
+ # Stage templates
211
+
212
# Stage template for text processing: maps each module name to the workflow template
# that governs it ("tokenizer", "text_processor", "ngrams").
TextProcessingTemplates: StageTemplate = {
    "tokenizer": TokenizerTemplate,
    "text_processor": TextProcessorTemplate,
    "ngrams": NgramsTemplate
}
217
+
218
# Module names allowed at each stage of the text processing stage workflow:
# stage 0 allows "tokenizer"; stage 1 allows "text_processor" or "ngrams".
TextProcessingStageModules: StageModules = {
    0: {"tokenizer"},
    1: {"text_processor", "ngrams"}
}
222
+
223
# Stage template for embedding plots: maps each dotted module name to the workflow
# template that governs it (clustering and embeddings under data_visualization).
PlotEmbeddingsTemplates: StageTemplate = {
    "data_visualization.clustering": ClusteringTemplate,
    "data_visualization.embeddings": PlotEmbeddingsTemplate,
}
227
+
228
# Module names allowed at each stage of the embedding plot stage workflow:
# stage 0 allows "data_visualization.clustering"; stage 1 allows "data_visualization.embeddings".
PlotEmbeddingsStageModules: StageModules = {
    0: {"data_visualization.clustering"},
    1: {"data_visualization.embeddings"}
}
@@ -0,0 +1,2 @@
1
+ from .workflow_tools import *
2
+ from .stage_workflows import *
@@ -0,0 +1,55 @@
1
+ """
2
+ This module provides classes and factory functions to build and manage complex, multi-stage workflows composed of sequential method executions across different processing modules. It supports workflow validation against predefined templates, method chaining with type enforcement, and flexible execution control, including partial or complete result retrieval.
3
+
4
+ Key components include:
5
+
6
+ * Factory functions like TextProcessing and PlotEmbeddings to easily instantiate common workflows with predefined templates and modules.
7
+
8
+ Designed to facilitate modular, extensible, and maintainable workflow construction for tasks such as text processing and embedding visualization.
9
+ """
10
+
11
+ from .workflow_tools import StageWorkflowBuilder
12
+ from ..templates import TextProcessingTemplates, TextProcessingStageModules, PlotEmbeddingsTemplates, PlotEmbeddingsStageModules
13
+ from ..lib_types import StageWorkflow
14
+
15
def TextProcessing(name: str, workflows: StageWorkflow) -> StageWorkflowBuilder:
    """
    Factory for a StageWorkflowBuilder preconfigured for text processing.

    The builder is constructed with TextProcessingTemplates and TextProcessingStageModules, i.e. 'tokenizer' in the first stage and 'text_processor' or 'ngrams' in the second.

    :param name: The name of the workflow instance.
    :type name: str
    :param workflows: A StageWorkflow dictionary defining the workflow steps per module/stage.
    :type workflows: StageWorkflow

    :return: A StageWorkflowBuilder configured with the text processing templates and stage modules.
    :rtype: StageWorkflowBuilder
    """
    return StageWorkflowBuilder(
        name=name,
        workflows=workflows,
        templates=TextProcessingTemplates,
        stage_modules=TextProcessingStageModules,
    )
35
+
36
def PlotEmbeddings(name: str, workflows: StageWorkflow) -> StageWorkflowBuilder:
    """
    Factory for a StageWorkflowBuilder preconfigured for plotting word embeddings.

    The builder is constructed with PlotEmbeddingsTemplates and PlotEmbeddingsStageModules, i.e. 'data_visualization.clustering' in the first stage and 'data_visualization.embeddings' in the second.

    :param name: The name of the workflow instance.
    :type name: str
    :param workflows: A StageWorkflow dictionary defining the workflow steps per module/stage.
    :type workflows: StageWorkflow

    :return: A StageWorkflowBuilder configured with the embedding plotting templates and stage modules.
    :rtype: StageWorkflowBuilder
    """
    return StageWorkflowBuilder(
        name=name,
        workflows=workflows,
        templates=PlotEmbeddingsTemplates,
        stage_modules=PlotEmbeddingsStageModules,
    )