dstklib 1.0.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
- dstk/__init__.py +10 -12
- dstk/adaptors/__init__.py +2 -0
- dstk/adaptors/adaptors.py +91 -0
- dstk/adaptors/typeguards.py +141 -0
- dstk/hooks/__init__.py +2 -0
- dstk/hooks/hook_tools.py +89 -0
- dstk/hooks/type_conversion.py +40 -0
- dstk/lib_types/__init__.py +2 -3
- dstk/lib_types/dstk_types.py +188 -16
- dstk/lib_types/plotly_types.py +1 -0
- dstk/method_index.py +32 -0
- dstk/models/__init__.py +2 -0
- dstk/models/model_tools.py +83 -0
- dstk/models/models.py +191 -0
- dstk/modules/__init__.py +10 -0
- dstk/modules/count_models.py +91 -0
- dstk/modules/data_visualization/__init__.py +2 -0
- dstk/modules/data_visualization/clustering.py +129 -0
- dstk/modules/data_visualization/embeddings.py +101 -0
- dstk/modules/geometric_distance.py +114 -0
- dstk/modules/ngrams.py +156 -0
- dstk/modules/predict_models.py +109 -0
- dstk/modules/text_matrix_builder.py +55 -0
- dstk/modules/text_processor.py +100 -0
- dstk/modules/tokenizer.py +139 -0
- dstk/modules/weight_matrix.py +65 -0
- dstk/templates/__init__.py +2 -0
- dstk/templates/rules.py +59 -0
- dstk/templates/templates.py +231 -0
- dstk/workflows/__init__.py +2 -0
- dstk/workflows/stage_workflows.py +55 -0
- dstk/workflows/workflow_tools.py +383 -0
- dstklib-2.0.0.dist-info/METADATA +377 -0
- dstklib-2.0.0.dist-info/RECORD +43 -0
- dstk/collocations.py +0 -121
- dstk/count_models.py +0 -112
- dstk/geometric_distance.py +0 -107
- dstk/lib_types/matplotlib_types.py +0 -4
- dstk/lib_types/nltk_types.py +0 -1
- dstk/matrix_base.py +0 -113
- dstk/pipeline_tools.py +0 -27
- dstk/pipelines.py +0 -114
- dstk/plot_embeddings.py +0 -240
- dstk/predict_models.py +0 -189
- dstk/text_matrix_builder.py +0 -87
- dstk/text_processor.py +0 -450
- dstk/weight_matrix.py +0 -71
- dstk/workflow_tools.py +0 -257
- dstklib-1.0.2.dist-info/METADATA +0 -369
- dstklib-1.0.2.dist-info/RECORD +0 -28
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
dstk/modules/weight_matrix.py
ADDED
@@ -0,0 +1,65 @@
"""
This module provides functions to apply weighting schemes to co-occurrence matrices commonly used in natural language processing and text mining.

Available weighting methods include:

* Pointwise Mutual Information (PMI) and Positive PMI (PPMI), which measure the association strength between co-occurring terms by comparing observed co-occurrence frequencies to expected frequencies under independence.
* Term Frequency-Inverse Document Frequency (Tf-idf), which reweights term importance based on frequency patterns, leveraging sklearn's TfidfTransformer.

These weighting techniques help enhance the semantic relevance of co-occurrence matrices, improving downstream tasks such as word embedding, clustering, and semantic similarity analysis.

All functions return weighted co-occurrence matrices as Pandas DataFrames for convenient further analysis.
"""

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

from ..lib_types import DataFrame, ndarray, Series, csr_matrix


def pmi(co_matrix: DataFrame, positive: bool = False) -> DataFrame:
    """
    Weights a co-occurrence matrix by PMI or PPMI.

    :param co_matrix: A co-occurrence matrix to be weighted.
    :type co_matrix: DataFrame
    :param positive: If True, weights the co-occurrence matrix by PPMI. If False, weights it by PMI. Defaults to False.
    :type positive: bool

    :returns: A co-occurrence matrix weighted by PMI or PPMI.
    :rtype: DataFrame
    """

    df: DataFrame = co_matrix

    col_totals: Series = df.sum(axis=0)
    total: float = col_totals.sum()
    row_totals: Series = df.sum(axis=1)
    expected: ndarray = np.outer(row_totals, col_totals) / total
    df = df / expected
    # Silence distracting warnings about log(0):
    with np.errstate(divide='ignore'):
        df = np.log(df)
    df[np.isinf(df)] = 0.0  # log(0) = 0
    if positive:
        df[df < 0] = 0.0

    return df

def tf_idf(co_matrix: DataFrame, **kwargs) -> DataFrame:
    """
    Weights a co-occurrence matrix by Tf-idf.

    :param co_matrix: A co-occurrence matrix to be weighted.
    :type co_matrix: DataFrame
    :param kwargs: Additional keyword arguments to pass to sklearn's TfidfTransformer. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

    :returns: A co-occurrence matrix weighted by Tf-idf.
    :rtype: DataFrame
    """

    transformer: TfidfTransformer = TfidfTransformer(**kwargs)
    tf_idf_matrix: csr_matrix = transformer.fit_transform(co_matrix)

    return pd.DataFrame(tf_idf_matrix.toarray(), index=co_matrix.index, columns=co_matrix.columns)
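For orientation, here is a minimal usage sketch of the two functions above. It is not part of the package: the toy matrix is invented, and it assumes the module is importable as `dstk.modules.weight_matrix` (consistent with the file list, but unverified). Plain DataFrames work because the `dstk` type aliases are only annotations.

# Hypothetical usage sketch; the toy co-occurrence matrix is invented.
import pandas as pd
from dstk.modules.weight_matrix import pmi, tf_idf

co = pd.DataFrame(
    [[0, 4, 1],
     [4, 0, 2],
     [1, 2, 0]],
    index=["cat", "dog", "car"],
    columns=["cat", "dog", "car"],
)

ppmi = pmi(co, positive=True)             # log(observed / expected), negatives clipped to 0.0
weighted = tf_idf(co, sublinear_tf=True)  # kwargs are forwarded to sklearn's TfidfTransformer

print(ppmi.round(2))
print(weighted.round(2))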
dstk/templates/rules.py
ADDED
@@ -0,0 +1,59 @@
"""
Defines type-based exclusion rules that constrain which methods can be applied to data at different stages in a workflow.

Each rule maps a data type (e.g., `POSTaggedWordList`, `Sentences`, `str`, `Neighbors`) to a set of module-specific restrictions. These rules help ensure that operations are semantically valid and compatible with the current data representation, enabling type-aware validation and error handling during workflow execution.

Structure:

Each rule is a `RulesTemplate` (dict) where:

* Keys are module names (e.g., "tokenizer", "text_processor").
* Values define methods to exclude (either a list of method names or "*" for all).

The `TypeRules` dictionary aggregates all individual rules and serves as a centralized configuration for type-based behavior enforcement.

Use case:
These rules are primarily used in the validation step of workflow builders to prevent method misuse based on data type.
"""

from ..lib_types import RulesTemplate

POSTaggedWordListRules: RulesTemplate = {
    "tokenizer": {
        "exclude": ["pos_tagger"]
    },
    "text_processor": {
        "exclude": ["join"]
    },
    "ngrams": {
        "exclude": "*"
    }
}

SentencesRules: RulesTemplate = {
    "text_processor": {
        "exclude": ["get_vocabulary", "save_to_file"]
    },
    "ngrams": {
        "exclude": "*"
    }
}

StringRules: RulesTemplate = {
    "text_processor": {
        "exclude": ["to_lower", "get_vocabulary", "join", "save_to_file"]
    }
}

NeighborsRules: RulesTemplate = {
    "geometric_distance": {
        "exclude": "*"
    }
}

TypeRules: dict[str, RulesTemplate] = {
    "POSTaggedWordList": POSTaggedWordListRules,
    "Sentences": SentencesRules,
    "str": StringRules,
    "Neighbors": NeighborsRules
}
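To make the rule shape concrete, here is a minimal sketch of how a validator might consult `TypeRules`. The real enforcement lives in the workflow builders (`dstk/workflows/workflow_tools.py`, whose body is not shown in this excerpt), so `is_excluded` and the method name "bigrams" are hypothetical.

# Hypothetical helper; the actual validation logic lives in the workflow builders.
from dstk.templates.rules import TypeRules

def is_excluded(data_type: str, module: str, method: str) -> bool:
    """Return True if `method` of `module` is ruled out for data of `data_type`."""
    rule = TypeRules.get(data_type, {})
    excluded = rule.get(module, {}).get("exclude", [])
    # "exclude" is either a list of method names or "*" for all methods.
    return excluded == "*" or method in excluded

assert is_excluded("Sentences", "ngrams", "bigrams")        # "*" blocks every method
assert is_excluded("str", "text_processor", "to_lower")
assert not is_excluded("str", "text_processor", "pos_tagger")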
dstk/templates/templates.py
ADDED
@@ -0,0 +1,231 @@
"""
Defines reusable templates that specify the structure and constraints of workflows and step-based pipelines.

Each template outlines the allowed sequence of method steps for a given module, enforcing constraints such as:

* Which methods are permitted or excluded at each step (`include` / `exclude`)
* Whether methods can be used more than once (`repeat`)
* Whether more than one method can be selected on each step (`chaining`)
* How types are transformed via `triggers`

Key Components:

* **Workflow Templates** (`WorkflowTemplate`): Describe valid method sequences for individual modules (e.g., tokenization, text processing, dimensionality reduction).
* **Stage Templates** (`StageTemplate`): Group related modules into stages to define multi-module workflows.
* **Stage Modules** (`StageModules`): Define allowed module names for each stage in a stage-based workflow.

These templates are used by `WorkflowBuilder` and `StageWorkflowBuilder` to validate workflows and enforce correct sequencing of operations.

Examples of Defined Templates:

* `TokenizerTemplate`: Defines the tokenization workflow including model selection, unit selection (sentences/tokens), and token processing.
* `TextProcessorTemplate`: Defines generic text processing steps like lowercasing, joining, etc.
* `TextMatrixBuilderTemplate`: Specifies steps to create a document-term and co-occurrence matrix.
* `PlotEmbeddingsTemplate`: Governs how word embeddings are plotted after clustering.

The templates provide a flexible and declarative way to define what each step in a workflow is allowed to do based on processing intent and data type.
"""

from ..lib_types import WorkflowTemplate, StageTemplate, StageModules

TokenizerTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": ["apply_model"],
            "repeat": False,
            "chaining": False,
            "step_name": "select_model"
        },
        1: {
            "include": ["get_sentences", "get_tokens"],
            "repeat": False,
            "chaining": False,
            "step_name": "select_processing_unit"
        },
        2: {
            "include": ["remove_stop_words", "alphanumeric_raw_tokenizer"],
            "repeat": False,
            "chaining": False,
            "step_name": "tokenization"
        },
        3: {
            "include": "*",
            "repeat": True,
            "chaining": True,
            "step_name": "token_processing"
        }
    },
    "base_type": "Words",  # is this necessary?
    "triggers": {
        "pos_tagger": "POSTaggedWordList",
        "get_sentences": "Sentences"
    }
}

TextProcessorTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": True,
            "step_name": "text_processing"
        }
    },
    "base_type": "Words",
    "triggers": {}
}

NgramsTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "exclude": {"count_collocates": 1},
            "repeat": False,
            "chaining": False,
            "step_name": "find_ngrams"
        },
        1: {
            "include": ["count_collocates"],
            "repeat": False,
            "chaining": False,
            "step_name": "count_collocates"
        }
    },
    "base_type": "Words",
    "triggers": {}
}

TextMatrixBuilderTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": ["create_dtm"],
            "repeat": False,
            "chaining": True,
            "step_name": "document_term_matrix"
        },
        1: {
            "include": ["create_co_occurrence_matrix"],
            "repeat": False,
            "chaining": False,
            "step_name": "co_occurrence_matrix"
        }
    },
    "base_type": "DataFrame",
    "triggers": {}
}

WeightMatrixTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "weight_matrix"
        }
    },
    "base_type": "DataFrame",
    "triggers": {}
}

CountModelsTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": ["scale_matrix"],
            "repeat": False,
            "chaining": False,
            "step_name": "scale_matrix"
        },
        1: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "dimensionality_reduction"
        }
    },
    "base_type": "DataFrame",
    "triggers": {}
}

GeometricDistanceTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "geometric_distance"
        }
    },
    "base_type": "float",
    "triggers": {
        "nearest_neighbors": "Neighbors"
    }
}

PredictModelsTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "exclude": {"save_model": 1},
            "repeat": False,
            "chaining": False,
            "step_name": "select_model"
        },
        1: {
            "include": ["save_model"],
            "repeat": False,
            "chaining": False,
            "step_name": "save_model"
        }
    },
    "base_type": "NeuralModels",
    "triggers": {
        "save_model": "str"
    }
}

ClusteringTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "clustering"
        }
    },
    "base_type": "DataFrame",
    "triggers": {}
}

PlotEmbeddingsTemplate: WorkflowTemplate = {
    "steps": {
        0: {
            "include": "*",
            "repeat": False,
            "chaining": False,
            "step_name": "embedings_plot"
        }
    },
    "base_type": "Figure",
    "triggers": {}
}

# Stage templates

TextProcessingTemplates: StageTemplate = {
    "tokenizer": TokenizerTemplate,
    "text_processor": TextProcessorTemplate,
    "ngrams": NgramsTemplate
}

TextProcessingStageModules: StageModules = {
    0: {"tokenizer"},
    1: {"text_processor", "ngrams"}
}

PlotEmbeddingsTemplates: StageTemplate = {
    "data_visualization.clustering": ClusteringTemplate,
    "data_visualization.embeddings": PlotEmbeddingsTemplate,
}

PlotEmbeddingsStageModules: StageModules = {
    0: {"data_visualization.clustering"},
    1: {"data_visualization.embeddings"}
}
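As a reading aid, the sketch below (hypothetical, not package code) walks a candidate method sequence through `TokenizerTemplate` the way the docstring describes: each step offers an `include` allow-list (or `"*"` for anything), and `triggers` records how a method changes the working type. The final method name "to_lower" is an assumption; step 3 accepts any method.

# Hypothetical walk-through of TokenizerTemplate; WorkflowBuilder's real checks are richer.
from dstk.templates.templates import TokenizerTemplate

sequence = ["apply_model", "get_tokens", "remove_stop_words", "to_lower"]

current_type = TokenizerTemplate["base_type"]   # "Words"
for step_index, method in enumerate(sequence):
    step = TokenizerTemplate["steps"][step_index]
    allowed = step.get("include", "*")
    if allowed != "*" and method not in allowed:
        raise ValueError(f"{method!r} not allowed at step {step['step_name']!r}")
    # A trigger switches the working type, which rules.py then constrains further.
    current_type = TokenizerTemplate["triggers"].get(method, current_type)

print(current_type)  # still "Words": no trigger fired for this sequence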
dstk/workflows/stage_workflows.py
ADDED
@@ -0,0 +1,55 @@
"""
This module provides classes and factory functions to build and manage complex, multi-stage workflows composed of sequential method executions across different processing modules. It supports workflow validation against predefined templates, method chaining with type enforcement, and flexible execution control, including partial or complete result retrieval.

Key components include:

* Factory functions like TextProcessing and PlotEmbeddings to easily instantiate common workflows with predefined templates and modules.

Designed to facilitate modular, extensible, and maintainable workflow construction for tasks such as text processing and embedding visualization.
"""

from .workflow_tools import StageWorkflowBuilder
from ..templates import TextProcessingTemplates, TextProcessingStageModules, PlotEmbeddingsTemplates, PlotEmbeddingsStageModules
from ..lib_types import StageWorkflow

def TextProcessing(name: str, workflows: StageWorkflow) -> StageWorkflowBuilder:
    """
    Creates a StageWorkflowBuilder configured for text processing workflows. The modules included are 'tokenizer' in the first stage and 'text_processor' or 'ngrams' in the second.

    :param name: The name of the workflow instance.
    :type name: str
    :param workflows: A StageWorkflow dictionary defining the workflow steps per module/stage.
    :type workflows: StageWorkflow

    :return: An instance of StageWorkflowBuilder configured with text processing templates and modules.
    :rtype: StageWorkflowBuilder
    """
    TextProcessingWorkflow = StageWorkflowBuilder(
        templates=TextProcessingTemplates,
        stage_modules=TextProcessingStageModules,
        name=name,
        workflows=workflows
    )

    return TextProcessingWorkflow

def PlotEmbeddings(name: str, workflows: StageWorkflow) -> StageWorkflowBuilder:
    """
    Creates a StageWorkflowBuilder configured for word embedding plotting workflows. The modules included are 'data_visualization.clustering' in the first stage and 'data_visualization.embeddings' in the second.

    :param name: The name of the workflow instance.
    :type name: str
    :param workflows: A StageWorkflow dictionary defining the workflow steps per module/stage.
    :type workflows: StageWorkflow

    :return: An instance of StageWorkflowBuilder configured with embedding plotting templates and modules.
    :rtype: StageWorkflowBuilder
    """
    PlotEmbeddingsWorkflow = StageWorkflowBuilder(
        templates=PlotEmbeddingsTemplates,
        stage_modules=PlotEmbeddingsStageModules,
        name=name,
        workflows=workflows
    )

    return PlotEmbeddingsWorkflow
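Finally, a hedged usage sketch for the factory. The exact `StageWorkflow` shape is defined in `dstk/lib_types/dstk_types.py`, which this excerpt does not show, so the per-module method lists below are an assumption based on the docstrings; treat it as illustrative only.

# Hypothetical call; the precise StageWorkflow structure comes from dstk.lib_types.
from dstk.workflows.stage_workflows import TextProcessing

pipeline = TextProcessing(
    name="corpus_prep",
    workflows={
        "tokenizer": ["apply_model", "get_tokens", "remove_stop_words"],
        "text_processor": ["to_lower"],
    },
)
# The builder validates each stage against TextProcessingTemplates before running.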