dstklib 1.0.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
- dstk/__init__.py +10 -12
- dstk/adaptors/__init__.py +2 -0
- dstk/adaptors/adaptors.py +91 -0
- dstk/adaptors/typeguards.py +141 -0
- dstk/hooks/__init__.py +2 -0
- dstk/hooks/hook_tools.py +89 -0
- dstk/hooks/type_conversion.py +40 -0
- dstk/lib_types/__init__.py +2 -3
- dstk/lib_types/dstk_types.py +188 -16
- dstk/lib_types/plotly_types.py +1 -0
- dstk/method_index.py +32 -0
- dstk/models/__init__.py +2 -0
- dstk/models/model_tools.py +83 -0
- dstk/models/models.py +191 -0
- dstk/modules/__init__.py +10 -0
- dstk/modules/count_models.py +91 -0
- dstk/modules/data_visualization/__init__.py +2 -0
- dstk/modules/data_visualization/clustering.py +129 -0
- dstk/modules/data_visualization/embeddings.py +101 -0
- dstk/modules/geometric_distance.py +114 -0
- dstk/modules/ngrams.py +156 -0
- dstk/modules/predict_models.py +109 -0
- dstk/modules/text_matrix_builder.py +55 -0
- dstk/modules/text_processor.py +100 -0
- dstk/modules/tokenizer.py +139 -0
- dstk/modules/weight_matrix.py +65 -0
- dstk/templates/__init__.py +2 -0
- dstk/templates/rules.py +59 -0
- dstk/templates/templates.py +231 -0
- dstk/workflows/__init__.py +2 -0
- dstk/workflows/stage_workflows.py +55 -0
- dstk/workflows/workflow_tools.py +383 -0
- dstklib-2.0.0.dist-info/METADATA +377 -0
- dstklib-2.0.0.dist-info/RECORD +43 -0
- dstk/collocations.py +0 -121
- dstk/count_models.py +0 -112
- dstk/geometric_distance.py +0 -107
- dstk/lib_types/matplotlib_types.py +0 -4
- dstk/lib_types/nltk_types.py +0 -1
- dstk/matrix_base.py +0 -113
- dstk/pipeline_tools.py +0 -27
- dstk/pipelines.py +0 -114
- dstk/plot_embeddings.py +0 -240
- dstk/predict_models.py +0 -189
- dstk/text_matrix_builder.py +0 -87
- dstk/text_processor.py +0 -450
- dstk/weight_matrix.py +0 -71
- dstk/workflow_tools.py +0 -257
- dstklib-1.0.2.dist-info/METADATA +0 -369
- dstklib-1.0.2.dist-info/RECORD +0 -28
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
- {dstklib-1.0.2.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
dstk/__init__.py
CHANGED
@@ -1,12 +1,10 @@
-from .
-from .
-from .
-
-from .
-from .
-from .
-
-from .
-
-from .pipeline_tools import *
-from .pipelines import *
+from .modules import *
+from .workflows import *
+from .models import *
+
+from .templates import *
+from .hooks import *
+from .adaptors import *
+
+from .lib_types import *
+
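With the flat 1.x modules gone, the top-level package now re-exports everything from the new subpackages. A minimal sketch of the 2.0 import surface, assuming each subpackage star-exports its public names (only names visible elsewhere in this diff are used):

import dstk

hook = dstk.Hook("identity", lambda data: data)   # via `from .hooks import *`
tagged = dstk.POSTaggedWord("cats", "NOUN")       # via `from .lib_types import *`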
dstk/adaptors/adaptors.py
ADDED
@@ -0,0 +1,91 @@
+"""
+This module provides function decorators that adapt the input types of processing functions to improve flexibility and composability across workflows.
+
+Specifically, it includes:
+
+* `accepts_sentences_and_collocates`: Allows a function to seamlessly handle both individual token sequences and lists of such sequences (e.g., sentences or collocate groups).
+* `accepts_tags`: Allows functions designed for plain tokens to accept and return POS-tagged inputs (POSTaggedWord), preserving tag alignment.
+
+These adaptors make it easier to integrate diverse data types into a unified processing pipeline without requiring duplication of logic.
+"""
+
+from functools import wraps
+import inspect
+from inspect import BoundArguments
+from .typeguards import is_sentence, is_collocates, is_pos_tags
+
+from typing import TypeVar, Any, Callable, Sized, Iterable, cast
+from ..lib_types import Token, Sentences, POSTaggedWordList, POSTaggedWord
+
+T = TypeVar("T", bound=Sized)
+
+def accepts_sentences_and_collocates(method: Callable[..., T]) -> Callable[..., list[T] | T]:
+    """
+    Decorator that allows a function to accept either a single input (e.g., a list of tokens or collocates) or a list of such inputs (e.g., sentences or collocate groups). If a list of inputs is passed, the
+    function is applied to each element in the list, and a list of results is returned.
+
+    If the input is not a list of sentences or collocates, the function is applied normally.
+
+    :param method: The function to wrap.
+    :type method: Callable[..., T]
+
+    :return: A wrapped function that handles both single and batched inputs.
+    :rtype: Callable[..., list[T] | T]
+    """
+
+    @wraps(method)
+    def wrapper(*args: Any, **kwargs: Any) -> list[T] | T:
+        signature = inspect.signature(method)
+        bound_args: BoundArguments = signature.bind(*args, **kwargs)
+        bound_args.apply_defaults()
+
+        tokens = next(iter(bound_args.arguments.values()), None)
+
+        if is_sentence(tokens) or is_collocates(tokens):
+            processed_sentences = [method(sentence, **kwargs) for sentence in tokens]
+
+            return [sentence for sentence in processed_sentences if sentence]
+        else:
+            return method(tokens, **kwargs)
+    return wrapper
+
+
+def accepts_tags(method: Callable[..., T]) -> Callable[..., POSTaggedWordList | T]:
+    """
+    Decorator that allows a function designed to operate on plain tokens to also handle POS-tagged word inputs (i.e., sequences of POSTaggedWord).
+
+    The function will automatically extract the token part, apply the method, and then re-attach the POS tags to the result. If the number of returned tokens does not match the original length, the POS tags are inferred from a lowercase mapping of the original words.
+
+    :param method: The function to wrap.
+    :type method: Callable[..., T]
+
+    :return: A wrapped function that processes POS-tagged input and returns a POSTaggedWordList.
+    :rtype: Callable[..., POSTaggedWordList | T]
+    """
+
+    @wraps(method)
+    def wrapper(*args: Any, **kwargs: Any) -> POSTaggedWordList | T:
+        signature = inspect.signature(method)
+        bound_args = signature.bind(*args, **kwargs)
+        bound_args.apply_defaults()
+
+        tokens = next(iter(bound_args.arguments.values()), None)
+
+        if is_pos_tags(tokens):
+            words, pos = zip(*tokens)
+
+            result = method(list(words), **kwargs)
+
+            if len(result) == len(tokens):
+                return [POSTaggedWord(word, pos_tag) for word, pos_tag in zip(cast(Iterable, result), pos)]
+            else:
+                original_pos_map = dict(zip([word.text.lower() if isinstance(word, Token) else word.lower() for word in words], pos))
+
+                print(words, pos, original_pos_map)
+                result_with_pos = [POSTaggedWord(word, original_pos_map[word.text.lower() if isinstance(word, Token) else word.lower()]) for word in cast(Iterable, result)]
+
+                return result_with_pos
+        else:
+            return method(tokens, **kwargs)
+
+    return wrapper
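A usage sketch for the two adaptors, assuming they are star-exported as shown in dstk/__init__.py; `unique_tokens` and `drop_short` are hypothetical helpers, not part of dstklib:

from dstk.adaptors import accepts_sentences_and_collocates, accepts_tags
from dstk.lib_types import POSTaggedWord

@accepts_sentences_and_collocates
def unique_tokens(tokens: list[str]) -> list[str]:
    # Order-preserving deduplication of one token sequence.
    return list(dict.fromkeys(tokens))

unique_tokens(["the", "cat", "the", "mat"])    # ['the', 'cat', 'mat']
unique_tokens([["a", "a", "b"], ["c", "c"]])   # [['a', 'b'], ['c']] -- applied per sentence

@accepts_tags
def drop_short(tokens: list[str]) -> list[str]:
    # Returning fewer tokens than were passed in exercises the fallback
    # lowercase mapping that re-attaches the POS tags.
    return [token for token in tokens if len(token) > 3]

drop_short([POSTaggedWord("cats", "NOUN"), POSTaggedWord("on", "ADP")])
# [POSTaggedWord(word='cats', pos='NOUN')]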
dstk/adaptors/typeguards.py
ADDED
@@ -0,0 +1,141 @@
+"""
+Provides a set of type guard functions to safely and explicitly check the types of various token and workflow-related objects.
+
+These functions help with runtime type checking and enable more precise type hinting and static analysis when working with linguistic data structures such as:
+
+* POS-tagged word lists
+* Collocates lists
+* Sentences (token or string sequences)
+* Workflow step definitions
+* Token-based collocates
+
+By using these type guards, code can branch safely based on the structure and types of input data, improving robustness and developer experience.
+
+Example:
+
+.. code-block:: python
+
+    if is_pos_tags(tokens):
+        # tokens is now narrowed to POSTaggedWordList type
+        process_pos_tags(tokens)
+"""
+
+from typing import Any, TypeGuard
+from ..lib_types import POSTaggedWordList, CollocatesList, Sentences, Token, Workflow, POSTaggedWord, Bigram, Collocates
+
+def is_pos_tags(tokens: Any) -> TypeGuard[POSTaggedWordList]:
+    """
+    Checks if the input is a list of POS-tagged words (POSTaggedWordList).
+
+    :param tokens: The object to check.
+    :type tokens: Any
+
+    :return: True if `tokens` is a non-empty list where all elements are instances of POSTaggedWord, otherwise False.
+    :rtype: bool
+    """
+
+    if not isinstance(tokens, list) or not tokens:
+        return False
+    return all(isinstance(item, POSTaggedWord) for item in tokens)
+
+def is_collocates(tokens: Any) -> TypeGuard[CollocatesList]:
+    """
+    Checks if the input is a list of collocate tuples, where each tuple contains strings or Token instances, excluding types like POSTaggedWord or Bigram.
+
+    :param tokens: The object to check.
+    :type tokens: Any
+
+    :return: True if `tokens` is a non-empty list of tuples of strings or Token instances (excluding POSTaggedWord and Bigram), otherwise False.
+    :rtype: bool
+    """
+
+    if not isinstance(tokens, list) or not tokens:
+        return False
+
+    return all(
+        isinstance(item, tuple) and
+        not isinstance(item, POSTaggedWord) and
+        not isinstance(item, Bigram) and
+        all(
+            isinstance(word, str) or
+            isinstance(word, Token)
+            for word in item
+        )
+        for item in tokens
+    )
+
+def is_sentence(tokens: Any) -> TypeGuard[Sentences]:
+    """
+    Checks if the input is a list of sentences, where each sentence is either:
+
+    * A list of Token instances,
+    * A list of strings, or
+    * A list of POSTaggedWord instances.
+
+    :param tokens: The object to check.
+    :type tokens: Any
+
+    :return: True if `tokens` matches the described sentence structure, otherwise False.
+    :rtype: bool
+    """
+
+    if not isinstance(tokens, list) or not tokens:
+        return False
+
+    return (
+        isinstance(tokens, list) and
+        all(
+            (
+                isinstance(item, list) and
+                (
+                    all(isinstance(token, Token) for token in item) or
+                    all(isinstance(token, str) for token in item)
+                )
+            ) or is_pos_tags(item)
+            for item in tokens
+        )
+    )
+
+def is_workflow(workflow: Any) -> TypeGuard[Workflow]:
+    """
+    Checks if the input is a workflow structure, i.e., a non-empty list of dictionaries where each dictionary maps string method names to argument dictionaries with string keys.
+
+    :param workflow: The object to check.
+    :type workflow: Any
+
+    :return: True if `workflow` matches the workflow structure, otherwise False.
+    :rtype: bool
+    """
+
+    if not isinstance(workflow, list) or not workflow:
+        return False
+
+    return all(
+        isinstance(method, dict) and
+        all(
+            isinstance(key, str) and
+            isinstance(value, dict) and
+            all(
+                isinstance(arg, str)
+                for arg in value.keys()
+            )
+            for key, value in method.items()
+        )
+        for method in workflow
+    )
+
+def is_token_collocates(collocates: Collocates) -> TypeGuard[Collocates[Token]]:
+    """
+    Checks if the input collocates tuple consists exclusively of Token instances and excludes Bigram and POSTaggedWord types.
+
+    :param collocates: The collocates tuple to check.
+    :type collocates: Collocates
+
+    :return: True if all elements in `collocates` are Token instances and not Bigram or POSTaggedWord, otherwise False.
+    :rtype: bool
+    """
+
+    return isinstance(collocates, tuple) and not isinstance(collocates, Bigram) and not isinstance(collocates, POSTaggedWord) and all(
+        isinstance(token, Token)
+        for token in collocates
+    )
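A short branching sketch built on these guards; the data here is constructed by hand rather than produced by the tokenizer module:

from dstk.adaptors.typeguards import is_pos_tags, is_workflow
from dstk.lib_types import POSTaggedWord

tokens = [POSTaggedWord("cats", "NOUN"), POSTaggedWord("purr", "VERB")]
if is_pos_tags(tokens):
    # Static checkers narrow `tokens` to POSTaggedWordList inside this branch.
    words = [entry.word for entry in tokens]

# A Workflow is a list of {method_name: kwargs} dictionaries.
assert is_workflow([{"apply_model": {}}, {"remove_stop_words": {}}])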
dstk/hooks/__init__.py
ADDED
dstk/hooks/hook_tools.py
ADDED
@@ -0,0 +1,89 @@
+"""
+Module providing the Hook class for wrapping callable methods with a strict single-argument interface.
+
+The Hook class allows encapsulating any callable that accepts exactly one argument, enforcing this constraint at runtime. It supports invocation, argument validation, and renaming of the hook instance.
+
+This module is useful for building extensible and modular models where hooks act as customizable processing steps.
+"""
+
+from __future__ import annotations
+from typing import Any, Callable
+from inspect import signature, Signature, Parameter
+from copy import deepcopy
+from collections.abc import ValuesView
+
+class Hook:
+    """
+    Represents a callable hook that wraps a single-argument function.
+
+    A Hook encapsulates a method that must accept exactly one argument, enabling
+    modular processing steps or callbacks within workflows or pipelines.
+
+    :param name: A descriptive name for the hook.
+    :type name: str
+    :param method: A callable that takes exactly one argument and performs some operation.
+    :type method: Callable[[Any], Any]
+
+    Usage:
+
+    .. code-block:: python
+
+        CustomHook = Hook("example_hook", some_function)
+        result = CustomHook(data)  # Calls some_function(data)
+    """
+
+    def __init__(self, name: str, method: Callable[[Any], Any]):
+        """
+        Initializes Hook with given attributes.
+        """
+
+        self.name: str = name
+        self.method: Callable = method
+
+    def __call__(self, *args, **kwargs) -> Any:
+        """
+        Invokes the wrapped method with the provided arguments.
+
+        Ensures that the wrapped method accepts exactly one argument before calling it. Raises a ValueError if the method signature does not conform to this requirement.
+
+        :param args: Positional arguments to pass to the wrapped method.
+        :param kwargs: Keyword arguments to pass to the wrapped method.
+
+        :return: The result of calling the wrapped method with the given arguments.
+        :rtype: Any
+
+        :raises ValueError: If the wrapped method does not accept exactly one argument.
+        """
+        if not self._check_args():
+            raise ValueError("A hook must accept exactly one argument")
+
+        return self.method(*args, **kwargs)
+
+    def _check_args(self) -> bool:
+        """
+        Checks whether the wrapped method accepts exactly one argument (positional or keyword).
+
+        :return: True if the method accepts exactly one argument, False otherwise.
+        :rtype: bool
+        """
+        sig: Signature = signature(self.method)
+        params: ValuesView[Parameter] = sig.parameters.values()
+
+        normalized_params: list[Parameter] = [param for param in params if param.kind in (param.POSITIONAL_ONLY, param.POSITIONAL_OR_KEYWORD, param.KEYWORD_ONLY)]
+
+        return True if len(normalized_params) == 1 else False
+
+    def rename(self, new_name: str) -> Hook:
+        """
+        Creates a new Hook instance with the same method but a different name.
+
+        :param new_name: The new name for the hook.
+        :type new_name: str
+
+        :return: A new Hook instance with the updated name.
+        :rtype: Hook
+        """
+        new_instance: Hook = deepcopy(self)
+        new_instance.name = new_name
+
+        return new_instance
dstk/hooks/type_conversion.py
ADDED
@@ -0,0 +1,40 @@
+"""
+This module provides hooks and utilities for converting between different data types used in the processing model.
+
+Currently, it includes a hook to convert trained word embedding models (Word2Vec or FastText) into pandas DataFrames, enabling easier manipulation and analysis of embeddings.
+
+Additional conversion hooks can be added in the future to support other type transformations.
+"""
+
+import pandas as pd
+import numpy as np
+from .hook_tools import Hook
+
+from ..lib_types import Words, ndarray, DataFrame, Word2Vec, FastText, NeuralModels
+
+def model_to_dataframe(model: NeuralModels) -> DataFrame:
+    """
+    Converts a trained Word2Vec or FastText model into a DataFrame of word embeddings.
+
+    :param model: A trained Word2Vec or FastText model.
+    :type model: NeuralModels
+
+    :return: A DataFrame containing the word embeddings and their associated labels.
+    :rtype: DataFrame
+    """
+
+    word_vectors: ndarray
+    labels: list[str]
+
+    if isinstance(model, Word2Vec):
+        word_vectors = model.wv[model.wv.index_to_key]
+        labels = list(model.wv.index_to_key)
+    elif isinstance(model, FastText):
+        words: Words[str] = model.words
+        word_vectors = np.array([model[word] for word in words])
+        labels = words
+
+    return pd.DataFrame(word_vectors, index=labels)
+
+ModelToDataframe: Hook = Hook(name="ModelToDataframe", method=model_to_dataframe)
+
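A sketch of the conversion hook on a toy gensim model; the tiny corpus and vector_size are arbitrary, and the import path assumes ModelToDataframe is re-exported from dstk.hooks:

from gensim.models import Word2Vec
from dstk.hooks import ModelToDataframe

model = Word2Vec([["hello", "world"], ["hello", "dstk"]], vector_size=8, min_count=1)
df = ModelToDataframe(model)   # rows indexed by word, one column per dimension
df.shape                       # (3, 8) for this toy vocabulary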
dstk/lib_types/__init__.py
CHANGED
@@ -2,8 +2,7 @@ from .spacy_types import *
 from .sklearn_types import *
 from .numpy_types import *
 from .pandas_types import *
-from .matplotlib_types import *
 from .fasttext_types import *
 from .gensim_types import *
-from .
-from .
+from .dstk_types import *
+from .plotly_types import *
dstk/lib_types/dstk_types.py
CHANGED
@@ -1,26 +1,198 @@
-from typing import TypeAlias,
-from .spacy_types import
+from typing import TypeAlias, TypeVar, Any, TypedDict, NotRequired, NamedTuple, Generator
+from .spacy_types import Token
 from .sklearn_types import csc_matrix, csr_matrix
 from .numpy_types import ndarray, NDArray, str_
 from .pandas_types import Index
+from .fasttext_types import FastText
+from .gensim_types import Word2Vec
+from collections import Counter
 
-
-
-POSIterator: TypeAlias = list[tuple[Token, str]]
-SentenceIterator: TypeAlias = list[Span] | list[list[Token]] | list[POSIterator]
-TextIterator: TypeAlias = list[str] | list[tuple[str, str]] | list[list[tuple[str, str]]]
+#: Numeric types accepted (integer or float).
+Number: TypeAlias = int | float
 
-
-
-POSTags: TypeAlias = list[tuple[Token | str, str]]
-Function = TypeVar("Function", bound=Callable[..., object])
+#: A generic type variable for words, bounded to str or spaCy Token.
+Word = TypeVar("Word", bound=str | Token)
 
-
-
+#: A list of words (strings or spaCy Tokens).
+Words: TypeAlias = list[Word]
 
-
+#: A tuple representing a group of collocates (words).
+Collocates: TypeAlias = tuple[Word, ...]
+
+#: A list of collocate tuples.
+CollocatesList = list[Collocates]
+
+class POSTaggedWord(NamedTuple):
+    """
+    Represents a word paired with its Part-Of-Speech (POS) tag.
+
+    :param word: The word, either as a string or spaCy Token.
+    :type word: str or Token
+    :param pos: The POS tag of the word.
+    :type pos: str
+    """
+
+    word: str | Token
+    pos: str
+
+#: A list of POS-tagged words.
+POSTaggedWordList: TypeAlias = list[POSTaggedWord]
+
+class Bigram(NamedTuple):
+    """
+    Represents a bigram collocation between two words.
+
+    :param collocate: The collocate word.
+    :type collocate: str or Token
+    :param target_word: The target word in the bigram.
+    :type target_word: str
+    """
+
+    collocate: str | Token
+    target_word: str
+
+#: A list of bigram tuples.
+BigramList: TypeAlias = list[Bigram]
+
+#: Directed collocates represented as a tuple of a word and a pair of directional tags.
+DirectedCollocates: TypeAlias = tuple[Word, tuple[str, str]]
+
+#: A list of directed collocates.
+DirectedCollocateList: TypeAlias = list[DirectedCollocates]
+
+#: Union type of all tagged word lists.
+TaggedWordsList: TypeAlias = CollocatesList | DirectedCollocateList | POSTaggedWordList | BigramList
+
+#: A list of sentences, where each sentence is a list of words.
+WordSenteces: TypeAlias = list[Words]
+#: A list of tagged sentences, each containing tagged words.
+TaggedSentences: TypeAlias = list[TaggedWordsList]
+
+#: Union type representing either plain or tagged sentences.
+Sentences: TypeAlias = WordSenteces | TaggedSentences
+
+
+#: A tuple representing a neighboring word and its association score.
+class Neighbor(NamedTuple):
+    word: str
+    score: float
+#: A list of neighboring words with scores.
+Neighbors: TypeAlias = list[Neighbor]
+
+#: A union of neural language model types.
+NeuralModels: TypeAlias = Word2Vec | FastText
+
+#: A counter mapping words (strings) to their frequency counts.
+WordCounts: TypeAlias = Counter[str]
+
+#: A union of matrix types from SciPy or NumPy.
 Matrix: TypeAlias = csr_matrix | csc_matrix | ndarray
+
+#: Labels used in pandas DataFrames, representing index or column labels.
+#:
+#: This can be a NumPy ndarray of strings, a pandas Index, a list of strings, or None.
 Labels: TypeAlias = NDArray[str_] | Index | list[str] | None
 
-
-
+StepConfig = TypedDict(
+    "StepConfig",
+    {
+        "include": NotRequired[list[str] | str],
+        "exclude": NotRequired[dict[str, int]],
+        "repeat": bool,
+        "chaining": bool,
+        "step_name": str
+    },
+    total=True
+)
+"""
+Configuration for a processing step in a workflow.
+
+:param include: Methods to include, either a list of strings or a single string.
+:type include: list[str] or str, optional
+:param exclude: Methods to exclude, as a dictionary mapping strings to integers.
+:type exclude: dict[str, int], optional
+:param repeat: Whether a method can be used more than once.
+:type repeat: bool
+:param chaining: Whether method chaining is enabled.
+:type chaining: bool
+:param step_name: The name of the step.
+:type step_name: str
+"""
+
+WorkflowTemplate = TypedDict(
+    "WorkflowTemplate",
+    {
+        "steps": dict[int, StepConfig],
+        "base_type": str,
+        "triggers": dict[str, str]
+    }
+)
+"""
+Template for an entire workflow, consisting of steps, a base type and triggers.
+
+:param steps: Mapping from step numbers to step configurations.
+:type steps: dict[int, StepConfig]
+:param base_type: The base type of the workflow.
+:type base_type: str
+:param triggers: Mapping from method names to the data types they produce. When a method changes the current data type (the default return type), the corresponding trigger activates rules that enable or disable subsequent methods.
+:type triggers: dict[str, str]
+"""
+
+#: A workflow is a list of ordered steps, where each step is a dictionary
+#: mapping method names to their keyword arguments.
+Workflow: TypeAlias = list[dict[str, dict[str, Any]]]
+#: A stage workflow contains multiple workflows organized by module names.
+#: Each key is a module name (e.g., 'tokenizer', 'ngrams', 'text_processor'),
+#: and the value is the workflow steps for that module.
+StageWorkflow: TypeAlias = dict[str, Workflow]
+#: Mapping from stage names to their corresponding workflow templates.
+#:
+#: Each key is a stage name (a string identifying a module),
+#: and the value is a `WorkflowTemplate` describing the processing steps and triggers
+#: allowed in that stage.
+StageTemplate: TypeAlias = dict[str, WorkflowTemplate]
+#: Mapping from stage indices (integers) to sets of module names allowed in that stage.
+#:
+#: Each key is a stage number, and the value is a set of module names (strings) that
+#: are enabled or active during that stage of the stage workflow.
+StageModules: TypeAlias = dict[int, set[str]]
+
+ExcludedMethods = TypedDict(
+    "ExcludedMethods",
+    {
+        "exclude": list[str] | str
+    }
+)
+"""
+Specifies methods to exclude by name.
+
+:param exclude: A list of method names or a single method name to exclude.
+:type exclude: list[str] or str
+"""
+
+#: Template defining rules for excluding methods once a specific type is triggered.
+#:
+#: The outer dictionary keys are module names (e.g., 'tokenizer', 'text_processor'),
+#: and the values specify which methods should be excluded in that module.
+#:
+#: For example, when the data type changes to 'POSTaggedWordList', these rules
+#: prevent further usage of specific methods like 'pos_tagger' in the tokenizer module.
+RulesTemplate: TypeAlias = dict[str, ExcludedMethods]
+
+class StepResult(NamedTuple):
+    """
+    Represents the result of executing a single workflow or model step.
+
+    :param name: The name of the step.
+    :param result: The output produced by the step.
+    """
+
+    name: str
+    result: Any
+
+
+#: Generator that yields `StepResult` objects, each representing the name and result of a workflow step.
+StepGenerator: TypeAlias = Generator[StepResult, None, None]
+
+#: Generator that yields results of workflow steps without step metadata.
+ResultGenerator: TypeAlias = Generator[Any, None, None]
dstk/lib_types/plotly_types.py
ADDED
@@ -0,0 +1 @@
+from plotly.graph_objects import Figure
dstk/method_index.py
ADDED
@@ -0,0 +1,32 @@
+INDEX = {
+    "tokenizer": {
+        "apply_model": {
+            "input": "str",
+            "output": "Doc"
+        },
+        "get_tokens": {
+            "input": "Doc",
+            "output": "Words[Tokens]"
+        },
+        "get_sentences": {
+            "input": "Doc",
+            "output": "Check it"
+        },
+        "remove_stop_words": {
+            "input": "Words[Token]",
+            "output": "Words[Token]"
+        },
+        "alphanumeric_raw_tokenizer": {
+            "input": "Words[Token]",
+            "output": "Words[Token]"
+        },
+        "filter_by_pos": {
+            "input": "Words[Token]",
+            "output": "Words[Token]"
+        },
+        "pos_tagger": {
+            "input": "Words[Token]",
+            "output": "POSTaggedText"
+        },
+    }
+}
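The index pairs each tokenizer method with its declared input and output types, so it can drive simple compatibility checks. A hedged sketch (the `compatible` helper is hypothetical, not part of dstklib):

from dstk.method_index import INDEX

def compatible(module: str, first: str, second: str) -> bool:
    # A step can feed the next one when its output type matches the
    # next step's declared input type.
    methods = INDEX[module]
    return methods[first]["output"] == methods[second]["input"]

compatible("tokenizer", "remove_stop_words", "filter_by_pos")   # True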
dstk/models/__init__.py
ADDED