dstklib 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstk/__init__.py +10 -12
- dstk/adaptors/__init__.py +2 -0
- dstk/adaptors/adaptors.py +91 -0
- dstk/adaptors/typeguards.py +141 -0
- dstk/hooks/__init__.py +2 -0
- dstk/hooks/hook_tools.py +89 -0
- dstk/hooks/type_conversion.py +40 -0
- dstk/lib_types/__init__.py +2 -3
- dstk/lib_types/dstk_types.py +188 -16
- dstk/lib_types/plotly_types.py +1 -0
- dstk/method_index.py +32 -0
- dstk/models/__init__.py +2 -0
- dstk/models/model_tools.py +83 -0
- dstk/models/models.py +191 -0
- dstk/modules/__init__.py +10 -0
- dstk/modules/count_models.py +91 -0
- dstk/modules/data_visualization/__init__.py +2 -0
- dstk/modules/data_visualization/clustering.py +129 -0
- dstk/modules/data_visualization/embeddings.py +101 -0
- dstk/modules/geometric_distance.py +114 -0
- dstk/modules/ngrams.py +156 -0
- dstk/modules/predict_models.py +109 -0
- dstk/modules/text_matrix_builder.py +55 -0
- dstk/modules/text_processor.py +100 -0
- dstk/modules/tokenizer.py +139 -0
- dstk/modules/weight_matrix.py +65 -0
- dstk/templates/__init__.py +2 -0
- dstk/templates/rules.py +59 -0
- dstk/templates/templates.py +231 -0
- dstk/workflows/__init__.py +2 -0
- dstk/workflows/stage_workflows.py +55 -0
- dstk/workflows/workflow_tools.py +383 -0
- dstklib-2.0.0.dist-info/METADATA +377 -0
- dstklib-2.0.0.dist-info/RECORD +43 -0
- dstk/collocations.py +0 -121
- dstk/count_models.py +0 -112
- dstk/geometric_distance.py +0 -107
- dstk/lib_types/matplotlib_types.py +0 -4
- dstk/lib_types/nltk_types.py +0 -1
- dstk/matrix_base.py +0 -113
- dstk/pipeline_tools.py +0 -27
- dstk/pipelines.py +0 -114
- dstk/plot_embeddings.py +0 -240
- dstk/predict_models.py +0 -189
- dstk/text_matrix_builder.py +0 -87
- dstk/text_processor.py +0 -450
- dstk/weight_matrix.py +0 -71
- dstk/workflow_tools.py +0 -257
- dstklib-1.0.1.dist-info/METADATA +0 -360
- dstklib-1.0.1.dist-info/RECORD +0 -28
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/LICENSE +0 -0
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/WHEEL +0 -0
- {dstklib-1.0.1.dist-info → dstklib-2.0.0.dist-info}/top_level.txt +0 -0
dstk/modules/geometric_distance.py
ADDED
@@ -0,0 +1,114 @@
"""
This module provides functions to compute geometric distance and similarity measures between word embeddings, enabling semantic comparison of words in vector space.

Available metrics include:

* Euclidean distance
* Manhattan distance
* Cosine similarity

Additionally, it offers a method to find the nearest semantic neighbors of a given word based on specified distance or similarity metrics using scikit-learn's NearestNeighbors.

All functions operate on word embeddings represented as Pandas DataFrames indexed by words, facilitating easy integration with common NLP and Computational Linguistics workflows.
"""

from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from ..lib_types import ndarray, Series, DataFrame, Neighbors, Neighbor

def euclidean_distance(embeddings: DataFrame, first_word: str, second_word: str) -> float:
    """
    Computes the Euclidean distance between the embeddings of two words.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str

    :returns: The Euclidean distance between the first and second word.
    :rtype: float
    """

    first_word_vector: Series = embeddings.loc[first_word]
    second_word_vector: Series = embeddings.loc[second_word]

    return float(np.linalg.norm(first_word_vector - second_word_vector))

def manhattan_distance(embeddings: DataFrame, first_word: str, second_word: str) -> float:
    """
    Computes the Manhattan distance between the embeddings of two words.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str

    :returns: The Manhattan distance between the first and second word.
    :rtype: float
    """

    first_word_vector: Series = embeddings.loc[first_word]
    second_word_vector: Series = embeddings.loc[second_word]

    return float(np.sum(np.abs(first_word_vector - second_word_vector)))

def cos_similarity(embeddings: DataFrame, first_word: str, second_word: str) -> float:
    """
    Computes the cosine similarity between the embeddings of two words.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str

    :returns: The cosine similarity between the first and second word.
    :rtype: float
    """

    first_word_vector: ndarray = np.array(embeddings.loc[first_word]).reshape(1, -1)
    second_word_vector: ndarray = np.array(embeddings.loc[second_word]).reshape(1, -1)

    cos_sim: ndarray = cosine_similarity(first_word_vector, second_word_vector)

    return float(cos_sim[0][0])

def nearest_neighbors(embeddings: DataFrame, word: str, metric: str = "cosine", n_words: int = 5, **kwargs) -> Neighbors:
    """
    Returns the top N most semantically similar words to a given target word, based on the specified distance or similarity metric.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param word: The target word to find neighbors for.
    :type word: str
    :param metric: The distance or similarity metric to use (e.g., 'cosine', 'euclidean'). Defaults to 'cosine'.
    :type metric: str
    :param n_words: Number of nearest neighbors to return. Defaults to 5.
    :type n_words: int
    :param kwargs: Additional keyword arguments to pass to sklearn's NearestNeighbors. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

    :returns: A list of `Neighbor` namedtuples, one for each word close to the target word.
    :rtype: Neighbors
    """

    neighbors: NearestNeighbors = NearestNeighbors(n_neighbors=n_words, algorithm="auto", metric=metric, **kwargs)
    neighbors.fit(embeddings.to_numpy())

    word_vector: Series = embeddings.loc[word]

    distances: ndarray
    indices: ndarray
    distances, indices = neighbors.kneighbors([word_vector], n_neighbors=n_words + 1)

    neighbor_tuples = zip(indices[0], distances[0])

    results: Neighbors = [Neighbor(embeddings.index[index], 1 - distance) for index, distance in neighbor_tuples if embeddings.index[index] != word]

    return results
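A minimal usage sketch for this module (the `dstk.modules.geometric_distance` import path is inferred from the file layout above, and the toy embeddings are made up for illustration, not part of the diff):

```python
# Illustrative only: import path and toy vectors are assumptions.
import pandas as pd
from dstk.modules import geometric_distance as gd

embeddings = pd.DataFrame(
    [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]],
    index=["cat", "dog", "car"],
)

print(gd.euclidean_distance(embeddings, "cat", "dog"))   # small distance
print(gd.cos_similarity(embeddings, "cat", "car"))       # close to 0.0
print(gd.nearest_neighbors(embeddings, "cat", n_words=2))  # Neighbor tuples, "cat" excluded
```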
dstk/modules/ngrams.py
ADDED
@@ -0,0 +1,156 @@
"""
This module provides utilities for extracting context-based collocates, bigrams, and n-grams from a list of words or tokens. It is designed to support both raw string tokens and spaCy `Token` objects, allowing for flexibility in preprocessing pipelines.

The functions in this module focus on identifying co-occurrence patterns around a specific target word, as well as extracting fixed-length n-grams from sequences of tokens. This is useful for tasks such as collocation analysis, feature engineering for machine learning models, and exploratory corpus analysis.

Core functionalities include:

* Extracting left and right context windows around a target word
* Creating directed and undirected bigrams centered on a target
* Generating fixed-length n-grams from a sequence of words
* Counting the frequency of collocated words in context windows

The module is compatible with both plain string tokens and spaCy Tokens.
"""


from collections import Counter
import pandas as pd

from typing import Generator
from ..lib_types import Words, WordCounts, DataFrame, CollocatesList, BigramList, DirectedCollocateList, Token, Bigram


def _find_contexts(words: Words, target_word: str, window_size: tuple[int, int]) -> Generator[tuple[Words, Words], None, None]:
    """
    Yields left and right contexts for each occurrence of a target word in a list of words.

    :param words: A list of words (strings or spaCy Tokens).
    :type words: Words

    :param target_word: The word to find within the list.
    :type target_word: str

    :param window_size: A tuple representing the number of words to include before and after the target word.
    :type window_size: tuple[int, int]

    :return: A generator of tuples, where each tuple contains the left and right context around a matched word.
    :rtype: Generator[tuple[Words, Words], None, None]
    """

    for index, word in enumerate(words):
        word_to_compare = word.text if isinstance(word, Token) else word
        if word_to_compare == target_word:
            start: int = max(0, index - window_size[0])
            end: int = min(len(words), index + window_size[1] + 1)

            left_context: Words = words[start:index]
            right_context: Words = words[index + 1:end]

            yield (left_context, right_context)

def extract_collocates(words: Words, target_word: str, window_size: tuple[int, int]) -> CollocatesList:
    """
    Extracts the context words of the target word, returned as tuples whose length corresponds to the specified window_size.

    :param words: A list of spaCy tokens or words represented as strings.
    :type words: Words
    :param target_word: The word to find within the list.
    :type target_word: str
    :param window_size: A tuple indicating how many words to capture to the left and right of the target.
    :type window_size: tuple[int, int]

    :returns: A list of collocates (left and right context words) of the target word.
    :rtype: CollocatesList
    """

    return [tuple(left + right) for left, right in _find_contexts(words, target_word, window_size)]

def extract_directed_bigrams(words: Words, target_word: str, window_size: tuple[int, int]) -> DirectedCollocateList:
    """
    Extracts directed bigrams (left and right context words) around a target word.

    For each occurrence of `target_word` in the input `words`, this function collects two types of bigrams:
    * Left bigrams: (context_word, ("L", target_word))
    * Right bigrams: (context_word, ("R", target_word))

    :param words: A list of spaCy tokens or words represented as strings.
    :type words: Words

    :param target_word: The word to search for in the list.
    :type target_word: str

    :param window_size: A tuple indicating how many words to capture to the left and right of the target.
    :type window_size: tuple[int, int]

    :return: A list of directed bigrams in the form `(word, ("L" | "R", target_word))`.
    :rtype: DirectedCollocateList
    """
    bigrams: DirectedCollocateList = []

    for left, right in _find_contexts(words, target_word, window_size):
        bigrams.extend([(word, ("L", target_word)) for word in left])
        bigrams.extend([(word, ("R", target_word)) for word in right])

    return bigrams

def extract_undirected_bigrams(words: Words, target_word: str, window_size: tuple[int, int]) -> BigramList:
    """
    Extracts undirected bigrams surrounding a target word.

    For each occurrence of `target_word`, this function collects all context words within the specified window (both left and right), and forms a `Bigram` with:

    * `collocate`: the context word
    * `target_word`: the target word

    :param words: A list of spaCy tokens or words represented as strings.
    :type words: Words

    :param target_word: The word to search for in the list.
    :type target_word: str

    :param window_size: A tuple indicating how many words to capture to the left and right of the target.
    :type window_size: tuple[int, int]

    :return: A list of `Bigram` namedtuples, one for each context word around each target occurrence.
    :rtype: BigramList
    """
    bigrams: BigramList = []

    for left, right in _find_contexts(words, target_word, window_size):
        bigrams.extend([Bigram(collocate=word, target_word=target_word) for word in left + right])

    return bigrams

def extract_ngrams(words: Words, window_size: int) -> CollocatesList:
    """
    Splits the tokens into groups of window_size consecutive words and returns each group as a tuple.

    :param words: A list of spaCy tokens or words represented as strings.
    :type words: Words
    :param window_size: The number of consecutive words in each n-gram.
    :type window_size: int

    :return: A list of tuples, where each tuple contains `window_size` consecutive words from the input.
    :rtype: CollocatesList
    """

    ngrams: CollocatesList = [tuple(words[index:index + window_size]) for index in range(len(words) - window_size + 1)]
    return ngrams

def count_collocates(collocates: CollocatesList) -> DataFrame:
    """
    Counts the frequency of words in a list of collocations and returns the result as a DataFrame.

    :param collocates: A list of collocations, where each collocation is a tuple of words.
    :type collocates: CollocatesList

    :return: A DataFrame with two columns: "word" and "count".
    :rtype: DataFrame
    """

    all_words: Words = [word.text if isinstance(word, Token) else word for collocation in collocates for word in collocation]
    word_counts: WordCounts = Counter(all_words)
    word_counts_df: DataFrame = pd.DataFrame(word_counts.items(), columns=["word", "count"])

    return word_counts_df
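A short sketch of how these helpers combine on plain string tokens (the `dstk.modules.ngrams` import path is inferred from the file layout above; the sample sentence is made up):

```python
# Illustrative only: import path and sample tokens are assumptions.
from dstk.modules import ngrams

words = ["the", "cat", "sat", "on", "the", "mat"]

print(ngrams.extract_collocates(words, "cat", window_size=(1, 2)))
# [('the', 'sat', 'on')]  -> one word left of "cat", two words right

print(ngrams.extract_ngrams(words, window_size=2))
# [('the', 'cat'), ('cat', 'sat'), ('sat', 'on'), ('on', 'the'), ('the', 'mat')]

collocates = ngrams.extract_collocates(words, "the", (1, 1))
print(ngrams.count_collocates(collocates))  # frequency table of context words
```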
dstk/modules/predict_models.py
ADDED
@@ -0,0 +1,109 @@
"""
This module provides utilities to train, save, and load word embedding models using neural network models such as Word2Vec (gensim) and FastText (fasttext library).

Functions include:

* *word2vec:* Train Word2Vec embeddings from a corpus file.
* *fastText:* Train FastText embeddings from a corpus file.
* *load_model:* Load a saved model from disk (supports Word2Vec .model and FastText .bin formats).
* *save_model:* Save a trained model to disk in the appropriate format.

Each function supports passing additional keyword arguments to fine-tune training and loading.
"""

from gensim.models import Word2Vec
import fasttext
from pathlib import Path

from ..lib_types import FastText, NeuralModels

def word2vec(path: str, **kwargs) -> Word2Vec:
    """
    Creates word embeddings using the Word2Vec algorithm.

    :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
    :type path: str
    :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. Common options include:

        * **vector_size:** Size of the word embedding vectors.
        * **workers:** Number of CPU cores to be used during the training process.
        * **sg:** Training algorithm. 1 for skip-gram; 0 for CBOW (Continuous Bag of Words).
        * **window (int):** Maximum distance between the current and predicted word.
        * **min_count (int):** Ignores all words with total frequency lower than this.

        For more information check: https://radimrehurek.com/gensim/models/word2vec.html

    :returns: An instance of gensim's Word2Vec.
    :rtype: Word2Vec
    """

    return Word2Vec(
        corpus_file=path,
        **kwargs
    )

def fastText(path: str, **kwargs) -> FastText:
    """
    Creates word embeddings using the FastText algorithm.

    :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
    :type path: str
    :param kwargs: Additional keyword arguments to pass to fasttext.train_unsupervised. Common options include:

        * **dim:** Size of the word embedding vectors.
        * **model:** Training algorithm: skipgram or cbow (Continuous Bag of Words).
        * **thread:** Number of CPU cores to be used during the training process.

        For more information check: https://fasttext.cc/docs/en/options.html

    :returns: An instance of fasttext's FastText.
    :rtype: FastText
    """

    return fasttext.train_unsupervised(
        path,
        **kwargs
    )

def load_model(path: str) -> NeuralModels:
    """
    Loads the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

    :param path: Path to the saved model file.
    :type path: str

    :returns: An instance of gensim's Word2Vec or fasttext's FastText.
    :rtype: NeuralModels
    """

    extension: str = Path(path).suffix.lower()

    if extension == ".model":
        return Word2Vec.load(path)
    elif extension == ".bin":
        return fasttext.load_model(path)
    else:
        raise ValueError(f"Model extension {extension} not recognized.")

def save_model(model: NeuralModels, path: str) -> str:
    """
    Saves the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.

    :param model: A trained Word2Vec or FastText model.
    :type model: NeuralModels
    :param path: The path (without extension) where to save the model.
    :type path: str

    :returns: The resolved path where the model was saved.
    :rtype: str
    """
    full_path: Path = Path(path)

    if isinstance(model, Word2Vec):
        model.save(str(full_path.with_suffix(".model")))
    elif isinstance(model, FastText):
        model.save_model(str(full_path.with_suffix(".bin")))
    else:
        raise NotImplementedError(f"Model type {type(model).__name__} not yet supported")

    return str(full_path.resolve())
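A possible round trip with these functions (the `dstk.modules.predict_models` import path, the corpus file name, and the hyperparameters are placeholders; training requires a whitespace-tokenized corpus file):

```python
# Illustrative only: import path, corpus.txt, and parameters are assumptions.
from dstk.modules import predict_models

# corpus.txt: one whitespace-tokenized sentence or collocation per line
model = predict_models.word2vec("corpus.txt", vector_size=50, window=3, min_count=1, sg=1)

saved_path = predict_models.save_model(model, "embeddings")   # writes embeddings.model
reloaded = predict_models.load_model("embeddings.model")       # back to a Word2Vec instance
```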
dstk/modules/text_matrix_builder.py
ADDED
@@ -0,0 +1,55 @@
"""
This module provides functions to construct common matrix representations used in text analysis and natural language processing.

Key features include:

* Creating a Document-Term Matrix (DTM) from a corpus of text, leveraging sklearn's CountVectorizer with customizable parameters such as stop word removal and n-gram range.
* Generating a Co-occurrence Matrix from a given Document-Term Matrix, capturing how frequently terms co-occur across documents.

These matrices are foundational for many NLP and Computational Linguistics tasks, including topic modeling, word embedding training, and network analysis. The output is provided as Pandas DataFrames for ease of analysis and integration with data science workflows.
"""

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd

from ..lib_types import csr_matrix, DataFrame, ndarray

def create_dtm(corpus: list[str], **kwargs) -> DataFrame:
    """
    Creates a Document Term Matrix (DTM).

    :param corpus: A list of sentences or collocations from which to build a matrix.
    :type corpus: list[str]
    :param kwargs: Additional keyword arguments to pass to sklearn's CountVectorizer. Common options include:

        * **stop_words:** If provided, a list of stopwords to remove from the corpus.
        * **ngram_range:** A tuple (min_n, max_n) specifying the range of n-grams to consider.

        For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

    :return: A Document Term Matrix (DTM).
    :rtype: DataFrame
    """

    vectorizer: CountVectorizer = CountVectorizer(**kwargs)

    dtm: csr_matrix = vectorizer.fit_transform(corpus)

    return pd.DataFrame(dtm.toarray(), index=np.array(corpus), columns=vectorizer.get_feature_names_out())

def create_co_occurrence_matrix(dtm: DataFrame) -> DataFrame:
    """
    Creates a Co-occurrence matrix from a Document Term Matrix (DTM).

    :param dtm: A Document Term Matrix (DTM) from which to build a Co-occurrence matrix.
    :type dtm: DataFrame

    :return: A Co-occurrence matrix.
    :rtype: DataFrame
    """
    matrix: ndarray = dtm.to_numpy()

    co_matrix: ndarray = matrix.T @ matrix

    return pd.DataFrame(co_matrix, index=dtm.columns, columns=dtm.columns)
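A small sketch of the two builders together (the `dstk.modules.text_matrix_builder` import path is inferred from the file layout above; the corpus is a toy example):

```python
# Illustrative only: import path and toy corpus are assumptions.
from dstk.modules import text_matrix_builder as tmb

corpus = ["the cat sat", "the dog sat", "the cat ran"]

dtm = tmb.create_dtm(corpus, stop_words=["the"])
co_matrix = tmb.create_co_occurrence_matrix(dtm)

print(dtm.shape)                     # documents x vocabulary terms
print(co_matrix.loc["cat", "sat"])   # 1: "cat" and "sat" share one document
```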
dstk/modules/text_processor.py
ADDED
@@ -0,0 +1,100 @@
"""
This module provides utility functions for processing tokenized or lemmatized text represented as lists of strings
or POS-tagged tuples. It supports common text normalization and transformation tasks, such as lowercasing,
vocabulary extraction, and joining tokens into a single string. Additionally, it includes functionality for saving
processed text or tagged data to a file in plain text or CSV format.

Core functionalities include:

* Converting spaCy tokens to strings (with optional lemmatization)
* Lowercasing and vocabulary extraction
* Joining word lists into full text strings
* Saving word lists or (token, POS) pairs to disk in a consistent format

This module is useful for preparing text data for further analysis, modeling, or storage.
"""

from pathlib import Path

from ..lib_types.dstk_types import Words, POSTaggedWordList, Token, POSTaggedWord

def tokens_to_text(tokens: Words[Token], lemmatize: bool = False) -> Words[str]:
    """
    Converts a list of spaCy Token objects to a list of words represented as strings.

    :param tokens: A list of spaCy tokens.
    :type tokens: Words[Token]
    :param lemmatize: Whether to return the lemmatized form of each token. Defaults to False.
    :type lemmatize: bool

    :return: A list of words represented as strings.
    :rtype: Words[str]
    """

    return [token.lemma_.lower() if lemmatize else token.text for token in tokens]

def to_lower(words: Words[str]) -> Words[str]:
    """
    Returns a list of lower-cased words.

    :param words: A list of words represented as strings.
    :type words: Words[str]

    :return: A list of words represented as strings.
    :rtype: Words[str]
    """

    return [word.lower() for word in words]

def get_vocabulary(words: Words[str]) -> Words[str]:
    """
    Returns the vocabulary of a text.

    :param words: A list of words represented as strings.
    :type words: Words[str]

    :return: A sorted list of unique words represented as strings.
    :rtype: Words[str]
    """

    return sorted(set(words))

def join(words: Words[str]) -> str:
    """
    Joins a list of strings into a single string text.

    :param words: A list of words represented as strings.
    :type words: Words[str]

    :return: A single string formed by concatenating the input words separated by spaces.
    :rtype: str
    """

    return " ".join(words)

def save_to_file(words: Words[str] | POSTaggedWordList, path: str) -> str:
    """
    Saves a list of strings or (Token, POS) tuples to the specified path. If words is a list of strings, each string is written on a new line. If it is a list of tuples, each tuple is written on a new line as a pair of values separated by a comma, in CSV format.

    :param words: A list of words represented as strings or a list of POSTaggedWord tuples.
    :type words: Words[str] or POSTaggedWordList
    :param path: The path where to save the list of words.
    :type path: str

    :return: The path where the file was saved.
    :rtype: str
    """

    with open(path, "w") as file:
        for word in words:
            if isinstance(word, str):
                file.write(word + "\n")
            elif isinstance(word, POSTaggedWord):
                if isinstance(word[0], str):
                    file.write(word[0] + "," + word[1] + "\n")
                else:
                    raise ValueError("You can only use save_to_file with a POSTaggedWordList if each word is of type str.")
            else:
                raise ValueError("You can only use save_to_file with Words[str] | POSTaggedWordList")

    return str(Path(path).resolve())
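A brief sketch using the string-based helpers (the `dstk.modules.text_processor` import path is inferred from the file layout above; the word list and output file name are placeholders):

```python
# Illustrative only: import path, word list, and output path are assumptions.
from dstk.modules import text_processor as tp

words = ["The", "cat", "sat", "on", "the", "mat"]

lowered = tp.to_lower(words)              # ["the", "cat", "sat", "on", "the", "mat"]
vocab = tp.get_vocabulary(lowered)        # sorted unique words
text = tp.join(lowered)                   # "the cat sat on the mat"
saved = tp.save_to_file(lowered, "tokens.txt")  # one word per line; returns resolved path
```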
dstk/modules/tokenizer.py
ADDED
@@ -0,0 +1,139 @@
"""
This module provides utility functions for tokenizing texts using spaCy.
It offers tools to process raw text into structured linguistic data, extract tokens and sentences, filter words by specific criteria (e.g., stop words, alphanumeric characters, part-of-speech), and
generate POS-tagged outputs.

Core functionalities include:

* Segmenting a text by applying a spaCy language model to raw text
* Extracting tokens and sentences from processed documents
* Removing stop words and non-alphanumeric tokens
* Filtering tokens by part-of-speech (POS) tags
* Generating (token, POS) tuples for downstream NLP tasks

The module is intended to provide tools for text segmentation and tagging.
"""

import spacy

from ..lib_types.spacy_types import *
from ..lib_types.dstk_types import Words, POSTaggedWordList, POSTaggedWord, WordSenteces

def apply_model(text: str, model: str | Language) -> Doc:
    """
    Takes a text and analyzes it using a language model. It returns a processed version of the text that includes helpful information like the words, their meanings, and how they relate to each other.

    :param text: The text to be processed.
    :type text: str
    :param model: The name of the model to be used or its instance.
    :type model: str or Language

    :return: A spaCy Doc object with linguistic annotations.
    :rtype: Doc
    """

    nlp: Language

    if isinstance(model, str):
        nlp = spacy.load(model)
    else:
        nlp = model

    return nlp(text)


def get_tokens(document: Doc) -> Words[Token]:
    """
    Returns a list of spaCy tokens from a Doc object.

    :param document: A spaCy Doc object.
    :type document: Doc

    :return: A list of spaCy tokens.
    :rtype: Words[Token]
    """

    return [token for token in document]


def get_sentences(document: Doc) -> WordSenteces:
    """
    Returns a list of sentences from a spaCy Doc, where each sentence is represented as a list of spaCy Token objects.

    :param document: A spaCy Doc object.
    :type document: Doc

    :return: A list of sentences, each sentence is a list of spaCy Tokens.
    :rtype: WordSenteces
    """

    return [[token for token in sentence] for sentence in document.sents]

def remove_stop_words(tokens: Words[Token], custom_stop_words: list[str] | None = None) -> Words[Token]:
    """
    Filters tokens, returning only alphabetic tokens that are not stop words.

    :param tokens: A list of spaCy tokens.
    :type tokens: Words[Token]
    :param custom_stop_words: If provided, a list of custom stop words. Defaults to None.
    :type custom_stop_words: list[str] or None

    :return: A list of spaCy tokens.
    :rtype: Words[Token]
    """

    lower_stop_words: list[str] = []

    if custom_stop_words:
        lower_stop_words = [word.lower() for word in custom_stop_words]

    return [
        token for token in tokens
        if token.is_alpha and not token.is_stop and
        token.text.lower() not in lower_stop_words
    ]

def alphanumeric_raw_tokenizer(tokens: Words[Token]) -> Words[Token]:
    """
    Filters tokens, keeping only alphabetic tokens; stop words are retained.

    :param tokens: A list of spaCy tokens.
    :type tokens: Words[Token]

    :return: A list of spaCy tokens.
    :rtype: Words[Token]
    """

    return [
        token
        for token in tokens
        if token.text.isalpha()
    ]

def filter_by_pos(tokens: Words[Token], pos: str) -> Words[Token]:
    """
    Returns a list of spaCy tokens filtered by a specific part-of-speech tag.

    :param tokens: A list of spaCy tokens.
    :type tokens: Words[Token]
    :param pos: The POS tag to filter by (e.g., 'NOUN', 'VERB', etc.). Case-sensitive.
    :type pos: str

    :return: A list of spaCy tokens.
    :rtype: Words[Token]
    """

    return [token for token in tokens if token.pos_ == pos]

def pos_tagger(tokens: Words[Token]) -> POSTaggedWordList:
    """
    Returns a list of (Token, POS) tuples, pairing each token with its part-of-speech tag.

    :param tokens: A list of spaCy tokens.
    :type tokens: Words[Token]

    :return: A list of POSTaggedWord tuples.
    :rtype: POSTaggedWordList
    """

    return [POSTaggedWord(token, token.pos_) for token in tokens]
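A possible end-to-end sketch for this module (the `dstk.modules.tokenizer` import path is inferred from the file layout above; the `en_core_web_sm` spaCy model is assumed to be installed separately):

```python
# Illustrative only: import path, sample text, and spaCy model name are assumptions.
from dstk.modules import tokenizer

doc = tokenizer.apply_model("The quick brown fox jumps over the lazy dog.", "en_core_web_sm")
tokens = tokenizer.get_tokens(doc)

content_words = tokenizer.remove_stop_words(tokens)   # alphabetic, non-stop-word tokens
nouns = tokenizer.filter_by_pos(tokens, "NOUN")       # tokens tagged NOUN
tagged = tokenizer.pos_tagger(content_words)          # (Token, POS) tuples
```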