dstklib 1.0.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. dstk/__init__.py +10 -12
  2. dstk/adaptors/__init__.py +2 -0
  3. dstk/adaptors/adaptors.py +91 -0
  4. dstk/adaptors/typeguards.py +141 -0
  5. dstk/hooks/__init__.py +2 -0
  6. dstk/hooks/hook_tools.py +89 -0
  7. dstk/hooks/type_conversion.py +40 -0
  8. dstk/lib_types/__init__.py +2 -3
  9. dstk/lib_types/dstk_types.py +188 -16
  10. dstk/lib_types/plotly_types.py +1 -0
  11. dstk/method_index.py +32 -0
  12. dstk/models/__init__.py +2 -0
  13. dstk/models/model_tools.py +83 -0
  14. dstk/models/models.py +191 -0
  15. dstk/modules/__init__.py +10 -0
  16. dstk/modules/count_models.py +91 -0
  17. dstk/modules/data_visualization/__init__.py +2 -0
  18. dstk/modules/data_visualization/clustering.py +129 -0
  19. dstk/modules/data_visualization/embeddings.py +101 -0
  20. dstk/modules/geometric_distance.py +114 -0
  21. dstk/modules/ngrams.py +156 -0
  22. dstk/modules/predict_models.py +109 -0
  23. dstk/modules/text_matrix_builder.py +55 -0
  24. dstk/modules/text_processor.py +100 -0
  25. dstk/modules/tokenizer.py +139 -0
  26. dstk/modules/weight_matrix.py +65 -0
  27. dstk/templates/__init__.py +2 -0
  28. dstk/templates/rules.py +59 -0
  29. dstk/templates/templates.py +231 -0
  30. dstk/workflows/__init__.py +2 -0
  31. dstk/workflows/stage_workflows.py +55 -0
  32. dstk/workflows/workflow_tools.py +383 -0
  33. dstklib-2.0.1.dist-info/METADATA +377 -0
  34. dstklib-2.0.1.dist-info/RECORD +43 -0
  35. dstk/collocations.py +0 -121
  36. dstk/count_models.py +0 -112
  37. dstk/geometric_distance.py +0 -107
  38. dstk/lib_types/matplotlib_types.py +0 -4
  39. dstk/lib_types/nltk_types.py +0 -1
  40. dstk/matrix_base.py +0 -113
  41. dstk/pipeline_tools.py +0 -27
  42. dstk/pipelines.py +0 -114
  43. dstk/plot_embeddings.py +0 -240
  44. dstk/predict_models.py +0 -189
  45. dstk/text_matrix_builder.py +0 -87
  46. dstk/text_processor.py +0 -450
  47. dstk/weight_matrix.py +0 -71
  48. dstk/workflow_tools.py +0 -257
  49. dstklib-1.0.2.dist-info/METADATA +0 -369
  50. dstklib-1.0.2.dist-info/RECORD +0 -28
  51. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/LICENSE +0 -0
  52. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/WHEEL +0 -0
  53. {dstklib-1.0.2.dist-info → dstklib-2.0.1.dist-info}/top_level.txt +0 -0
dstk/modules/geometric_distance.py ADDED
@@ -0,0 +1,114 @@
1
+ """
2
+ This module provides functions to compute geometric distance and similarity measures between word embeddings, enabling semantic comparison of words in vector space.
3
+
4
+ Available metrics include:
5
+
6
+ * Euclidean distance
7
+ * Manhattan distance
8
+ * Cosine similarity
9
+
10
+ Additionally, it offers a method to find the nearest semantic neighbors of a given word based on specified distance or similarity metrics using scikit-learn's NearestNeighbors.
11
+
12
+ All functions operate on word embeddings represented as Pandas DataFrames indexed by words, facilitating easy integration with common NLP and Computational Linguistics workflows.
13
+ """
14
+
15
+ from sklearn.neighbors import NearestNeighbors
16
+ import numpy as np
17
+ from sklearn.metrics.pairwise import cosine_similarity
18
+
19
+ from ..lib_types import ndarray, Series, DataFrame, Neighbors, Neighbor
20
+
21
+ def euclidean_distance(embeddings: DataFrame, first_word: str, second_word: str) -> float:
22
+ """
23
+ Computes the Euclidean distance between the embeddings of two words.
24
+
25
+ :param embeddings: A dataframe containing the word embeddings.
26
+ :type embeddings: DataFrame
27
+ :param first_word: The first word in the pair.
28
+ :type first_word: str
29
+ :param second_word: The second word in the pair.
30
+ :type second_word: str
31
+
32
+ :returns: The Euclidean distance between the first and second word.
33
+ :rtype: float
34
+ """
35
+
36
+ first_word_vector: Series = embeddings.loc[first_word]
37
+ second_word_vector: Series = embeddings.loc[second_word]
38
+
39
+ return float(np.linalg.norm(first_word_vector - second_word_vector))
40
+
41
+ def manhattan_distance(embeddings: DataFrame, first_word: str, second_word: str) -> float:
42
+ """
43
+ Computes the Manhattan distance between the embeddings of two words.
44
+
45
+ :param embeddings: A dataframe containing the word embeddings.
46
+ :type embeddings: DataFrame
47
+ :param first_word: The first word in the pair.
48
+ :type first_word: str
49
+ :param second_word: The second word in the pair.
50
+ :type second_word: str
51
+
52
+ :returns: The Manhattan distance between the first and second word.
53
+ :rtype: float
54
+ """
55
+
56
+ first_word_vector: Series = embeddings.loc[first_word]
57
+ second_word_vector: Series = embeddings.loc[second_word]
58
+
59
+ return float(np.sum(np.abs(first_word_vector - second_word_vector)))
60
+
61
+ def cos_similarity(embeddings: DataFrame, first_word: str, second_word: str) -> float:
62
+ """
63
+ Computes the cosine similarity between the embeddings of two words.
64
+
65
+ :param embeddings: A dataframe containing the word embeddings.
66
+ :type embeddings: DataFrame
67
+ :param first_word: The first word in the pair.
68
+ :type first_word: str
69
+ :param second_word: The second word in the pair.
70
+ :type second_word: str
71
+
72
+ :returns: The cosine similarity between the first and second word.
73
+ :rtype: float
74
+ """
75
+
76
+ first_word_vector: ndarray = np.array(embeddings.loc[first_word]).reshape(1, -1)
77
+ second_word_vector: ndarray = np.array(embeddings.loc[second_word]).reshape(1, -1)
78
+
79
+ cos_sim: ndarray = cosine_similarity(first_word_vector, second_word_vector)
80
+
81
+ return float(cos_sim[0][0])
82
+
83
+ def nearest_neighbors(embeddings: DataFrame, word: str, metric: str = "cosine", n_words: int = 5, **kwargs) -> Neighbors:
84
+ """
85
+ Returns the top N most semantically similar words to a given target word, based on the specified distance or similarity metric.
86
+
87
+ :param embeddings: A dataframe containing the word embeddings.
88
+ :type embeddings: DataFrame
89
+ :param word: The target word to find neighbors for.
90
+ :type word: str
91
+ :param metric: The distance or similarity metric to use (e.g., 'cosine', 'euclidean'). Defaults to 'cosine'.
92
+ :type metric: str
93
+ :param n_words: Number of nearest neighbors to return. Defaults to 5.
94
+ :type n_words: int
95
+ :param kwargs: Additional keyword arguments to pass to sklearn's NearestNeighbors. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
96
+
97
+ :returns: A list of `Neighbor` namedtuples, one for each word close to the target word.
98
+ :rtype: Neighbors
99
+ """
100
+
101
+ neighbors: NearestNeighbors = NearestNeighbors(n_neighbors=n_words, algorithm="auto", metric=metric, **kwargs)
102
+ neighbors.fit(embeddings.to_numpy())
103
+
104
+ word_vector: Series = embeddings.loc[word]
105
+
106
+ distances: ndarray
107
+ indices: ndarray
108
+ distances, indices = neighbors.kneighbors([word_vector], n_neighbors=n_words + 1)
109
+
110
+ neighbor_tuples = zip(indices[0], distances[0])
111
+
112
+ results: Neighbors = [Neighbor(embeddings.index[index], 1 - distance) for index, distance in neighbor_tuples if embeddings.index[index] != word]
113
+
114
+ return results
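As a minimal usage sketch of the functions above (assuming the module is importable as dstk.modules.geometric_distance, an import path inferred from the file listing and not verified here), a toy three-dimensional embedding table is enough to exercise each metric:

# Minimal usage sketch; the import path is an assumption based on the file listing.
import pandas as pd
from dstk.modules import geometric_distance as gd

# Toy 3-dimensional embeddings indexed by word.
embeddings = pd.DataFrame(
    [[0.1, 0.3, 0.5],
     [0.2, 0.1, 0.4],
     [0.9, 0.8, 0.1]],
    index=["cat", "dog", "car"],
)

print(gd.euclidean_distance(embeddings, "cat", "dog"))
print(gd.manhattan_distance(embeddings, "cat", "dog"))
print(gd.cos_similarity(embeddings, "cat", "car"))

# Nearest neighbors of "cat" by cosine distance; the target word itself is excluded.
for neighbor in gd.nearest_neighbors(embeddings, "cat", metric="cosine", n_words=2):
    print(neighbor)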
dstk/modules/ngrams.py ADDED
@@ -0,0 +1,156 @@
1
+ """
2
+ This module provides utilities for extracting context-based collocates, bigrams, and n-grams from a list of words or tokens. It is designed to support both raw string tokens and spaCy `Token` objects, allowing for flexibility in preprocessing pipelines.
3
+
4
+ The functions in this module focus on identifying co-occurrence patterns around a specific target word, as well as extracting fixed-length n-grams from sequences of tokens. This is useful for tasks such as collocation analysis, feature engineering for machine learning models, and exploratory corpus analysis.
5
+
6
+ Core functionalities include:
7
+
8
+ * Extracting left and right context windows around a target word
9
+ * Creating directed and undirected bigrams centered on a target
10
+ * Generating fixed-length n-grams from a sequence of words
11
+ * Counting the frequency of collocated words in context windows
12
+
13
+ The module is compatible with both plain string tokens and spaCy Tokens.
14
+ """
15
+
16
+
17
+ from collections import Counter
18
+ import pandas as pd
19
+
20
+ from typing import Generator
21
+ from ..lib_types import Words, WordCounts, DataFrame, CollocatesList, BigramList, DirectedCollocateList, Token, Bigram
22
+
23
+
24
+ def _find_contexts(words: Words, target_word: str, window_size: tuple[int, int]) -> Generator[tuple[Words, Words], None, None]:
25
+ """
26
+ Yields left and right contexts for each occurrence of a target word in a list of words.
27
+
28
+ :param words: A list of words (strings or spaCy Tokens).
29
+ :type words: Words
30
+
31
+ :param target_word: The word to find within the list.
32
+ :type target_word: str
33
+
34
+ :param window_size: A tuple representing the number of words to include before and after the target word.
35
+ :type window_size: tuple[int, int]
36
+
37
+ :return: A generator of tuples, where each tuple contains the left and right context around a matched word.
38
+ :rtype: Generator[tuple[Words, Words], None, None]
39
+ """
40
+
41
+ for index, word in enumerate(words):
42
+ word_to_compare = word.text if isinstance(word, Token) else word
43
+ if word_to_compare == target_word:
44
+ start: int = max(0, index - window_size[0])
45
+ end: int = min(len(words), index + window_size[1] + 1)
46
+
47
+ left_context: Words = words[start:index]
48
+ right_context: Words = words[index + 1:end]
49
+
50
+ yield (left_context, right_context)
51
+
52
+ def extract_collocates(words: Words, target_word: str, window_size: tuple[int, int]) -> CollocatesList:
53
+ """
54
+ Extracts the context words of the target word, returned as tuples whose length corresponds to the specified window_size.
55
+
56
+ :param words: A list of spaCy tokens or words represented as strings.
57
+ :type words: Words
58
+ :param target_word: The word to find within the list.
59
+ :type target_word: str
60
+ :param window_size: A tuple indicating how many words to capture to the left and right of the target.
61
+ :type window_size: tuple[int, int]
62
+
63
+ :returns: A list of collocates (left and right context words) of the target word.
64
+ :rtype: CollocatesList
65
+ """
66
+
67
+ return [tuple(left + right) for left, right in _find_contexts(words, target_word, window_size)]
68
+
69
+ def extract_directed_bigrams(words: Words, target_word: str, window_size: tuple[int, int]) -> DirectedCollocateList:
70
+ """
71
+ Extracts directed bigrams (left and right context words) around a target word.
72
+
73
+ For each occurrence of `target_word` in the input `words`, this function collects two types of bigrams:
74
+ * Left bigrams: (context_word, ("L", target_word))
75
+ * Right bigrams: (context_word, ("R", target_word))
76
+
77
+ :param words: A list of spaCy tokens or words represented as strings.
78
+ :type words: Words
79
+
80
+ :param target_word: The word to search for in the list.
81
+ :type target_word: str
82
+
83
+ :param window_size: A tuple indicating how many words to capture to the left and right of the target.
84
+ :type window_size: tuple[int, int]
85
+
86
+ :return: A list of directed bigrams in the form `(word, ("L" | "R", target_word))`.
87
+ :rtype: DirectedCollocateList
88
+ """
89
+ bigrams: DirectedCollocateList = []
90
+
91
+ for left, right in _find_contexts(words, target_word, window_size):
92
+ bigrams.extend([(word, ("L", target_word)) for word in left])
93
+ bigrams.extend([(word, ("R", target_word)) for word in right])
94
+
95
+ return bigrams
96
+
97
+ def extract_undirected_bigrams(words: Words, target_word: str, window_size: tuple[int, int]) -> BigramList:
98
+ """
99
+ Extracts undirected bigrams surrounding a target word.
100
+
101
+ For each occurrence of `target_word`, this function collects all context words within the specified window (both left and right), and forms a `Bigram` with:
102
+
103
+ * `collocate`: the context word
104
+ * `target_word`: the target word
105
+
106
+ :param words: A list of spaCy tokens or words represented as strings.
107
+ :type words: Words
108
+
109
+ :param target_word: The word to search for in the list.
110
+ :type target_word: str
111
+
112
+ :param window_size: A tuple indicating how many words to capture to the left and right of the target.
113
+ :type window_size: tuple[int, int]
114
+
115
+ :return: A list of `Bigram` namedtuples, one for each context word around each target occurrence.
116
+ :rtype: BigramList
117
+ """
118
+ bigrams: BigramList = []
119
+
120
+ for left, right in _find_contexts(words, target_word, window_size):
121
+ bigrams.extend([Bigram(collocate=word, target_word=target_word) for word in left + right])
122
+
123
+ return bigrams
124
+
125
+ def extract_ngrams(words: Words, window_size: int) -> CollocatesList:
126
+ """
127
+ Splits the tokens into groups of window_size consecutive words, returning each group as a tuple.
128
+
129
+ :param words: A list of spaCy tokens or words represented as strings.
130
+ :type words: Words
131
+ :param window_size: The number of consecutive words in each n-gram.
132
+ :type window_size: int
133
+
134
+ :return: A list of tuples, where each tuple contains `window_size` consecutive words from the input.
135
+ :rtype: CollocatesList
136
+ """
137
+
138
+ ngrams: CollocatesList = [tuple(words[index:index + window_size]) for index in range(len(words) - window_size + 1)]
139
+ return ngrams
140
+
141
+ def count_collocates(collocates: CollocatesList) -> DataFrame:
142
+ """
143
+ Counts the frequency of words in a list of collocations and returns the result as a DataFrame.
144
+
145
+ :param collocates: A list of collocations, where each collocation is a tuple of words.
146
+ :type collocates: CollocatesList
147
+
148
+ :return: A DataFrame with two columns: "word" and "count".
149
+ :rtype: DataFrame
150
+ """
151
+
152
+ all_words: Words = [word.text if isinstance(word, Token) else word for collocation in collocates for word in collocation]
153
+ word_counts: WordCounts = Counter(all_words)
154
+ word_counts_df: DataFrame = pd.DataFrame(word_counts.items(), columns=["word", "count"])
155
+
156
+ return word_counts_df
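A minimal usage sketch for these helpers, assuming the module is importable as dstk.modules.ngrams and using plain string tokens (spaCy Tokens work the same way):

# Minimal usage sketch; the import path is an assumption based on the file listing.
from dstk.modules import ngrams

words = ["the", "cat", "sat", "on", "the", "mat", "near", "the", "cat"]

# Two words of context on each side of every occurrence of "cat".
collocates = ngrams.extract_collocates(words, "cat", window_size=(2, 2))
print(collocates)

# Undirected bigrams with one word of context on each side.
print(ngrams.extract_undirected_bigrams(words, "cat", window_size=(1, 1)))

# Consecutive trigrams over the whole token sequence.
print(ngrams.extract_ngrams(words, window_size=3))

# Frequency table of the collocates gathered above.
print(ngrams.count_collocates(collocates))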
dstk/modules/predict_models.py ADDED
@@ -0,0 +1,109 @@
1
+ """
2
+ This module provides utilities to train, save, and load word embedding models using neural network algorithms such as Word2Vec (gensim) and FastText (fasttext library).
3
+
4
+ Functions include:
5
+
6
+ * *word2vec:* Train Word2Vec embeddings from a corpus file.
7
+ * *fastText:* Train FastText embeddings from a corpus file.
8
+ * *load_model:* Load a saved model from disk (supports Word2Vec .model and FastText .bin formats).
9
+ * *save_model:* Save a trained model to disk in the appropriate format.
10
+
11
+ Each function supports passing additional keyword arguments to fine-tune training and loading.
12
+ """
13
+
14
+ from gensim.models import Word2Vec
15
+ import fasttext
16
+ from pathlib import Path
17
+
18
+ from ..lib_types import FastText, NeuralModels
19
+
20
+ def word2vec(path: str, **kwargs) -> Word2Vec:
21
+ """
22
+ Creates word embeddings using the Word2Vec algorithm.
23
+
24
+ :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
25
+ :type path: str
26
+ :param kwargs: Additional keyword arguments to pass to gensim.models.Word2Vec. Common options include:
27
+
28
+ * **vector_size:** Size of the word embedding vectors.
29
+ * **workers:** Number of CPU cores to be used during the training process.
30
+ * **sg:** Training algorithm. 1 for skip-gram; 0 for CBOW (Continuous Bag of Words).
31
+ * **window (int):** Maximum distance between the current and predicted word.
32
+ * **min_count (int):** Ignores all words with total frequency lower than this.
33
+
34
+ For more information check: https://radimrehurek.com/gensim/models/word2vec.html
35
+
36
+ :returns: An instance of gensim's Word2Vec.
37
+ :rtype: Word2Vec
38
+ """
39
+
40
+ return Word2Vec(
41
+ corpus_file=path,
42
+ **kwargs
43
+ )
44
+
45
+ def fastText(path: str, **kwargs) -> FastText:
46
+ """
47
+ Creates word embeddings using the FastText algorithm.
48
+
49
+ :param path: The path to a file containing a list of sentences or collocations from which to build word embeddings.
50
+ :type path: str
51
+ :param kwargs: Additional keyword arguments to pass to fasttext.train_unsupervised. Common options include:
52
+
53
+ * **dim:** Size of the word embedding vectors.
54
+ * **model:** Training algorithm: skipgram or cbow (Continuous Bag of Words)
55
+ * **thread:** Number of CPU cores to be used during the training process.
56
+
57
+ For more information check: https://fasttext.cc/docs/en/options.html
58
+
59
+ :returns: An instance of fasttext's FastText.
60
+ :rtype: FastText
61
+ """
62
+
63
+ return fasttext.train_unsupervised(
64
+ path,
65
+ **kwargs
66
+ )
67
+
68
+ def load_model(path: str) -> NeuralModels:
69
+ """
70
+ Loads the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.
71
+
72
+ :param path: Path to the saved model file.
73
+ :type path: str
74
+
75
+ :returns: An instance of gensim's Word2Vec or fasttext's FastText.
76
+ :rtype: NeuralModels
77
+ """
78
+
79
+ extension: str = Path(path).suffix.lower()
80
+
81
+ if extension == ".model":
82
+ return Word2Vec.load(path)
83
+ elif extension == ".bin":
84
+ return fasttext.load_model(path)
85
+ else:
86
+ raise ValueError(f"Model extension {extension} not recognized.")
87
+
88
+ def save_model(model: NeuralModels, path: str) -> str:
89
+ """
90
+ Saves the trained embeddings in .model (Word2Vec) or .bin (FastText) format, depending on the algorithm used.
91
+
92
+ :param model: A trained Word2Vec or FastText model.
93
+ :type model: NeuralModels
94
+ :param path: The path (without extension) where to save the model.
95
+ :type path: str
96
+
97
+ :returns: The resolved path (without the added extension) where the model was saved.
98
+ :rtype: str
99
+ """
100
+ full_path: Path = Path(path)
101
+
102
+ if isinstance(model, Word2Vec):
103
+ model.save(str(full_path.with_suffix(".model")))
104
+ elif isinstance(model, FastText):
105
+ model.save_model(str(full_path.with_suffix(".bin")))
106
+ else:
107
+ raise NotImplementedError(f"Model type {type(model).__name__} not yet supported")
108
+
109
+ return str(full_path.resolve())
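A minimal training round-trip, assuming the module is importable as dstk.modules.predict_models; corpus.txt is a placeholder for a whitespace-tokenized text file, one sentence per line. Since save_model returns the path without the extension it appends, the extension is added back before loading:

# Minimal usage sketch; the import path and corpus.txt are assumptions.
from dstk.modules import predict_models

# Train a small skip-gram Word2Vec model from a corpus file.
w2v = predict_models.word2vec("corpus.txt", vector_size=50, sg=1, min_count=1, workers=2)

# save_model writes my_embeddings.model and returns the extension-less path.
saved_path = predict_models.save_model(w2v, "my_embeddings")

# load_model dispatches on the file extension, so append it explicitly.
reloaded = predict_models.load_model(saved_path + ".model")
print(reloaded.wv.most_similar(reloaded.wv.index_to_key[0], topn=3))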
dstk/modules/text_matrix_builder.py ADDED
@@ -0,0 +1,55 @@
1
+ """
2
+ This module provides functions to construct common matrix representations used in text analysis and natural language processing.
3
+
4
+ Key features include:
5
+
6
+ * Creating a Document-Term Matrix (DTM) from a corpus of text, leveraging sklearn's CountVectorizer with customizable parameters such as stop word removal and n-gram range.
7
+ * Generating a Co-occurrence Matrix from a given Document-Term Matrix, capturing how frequently terms co-occur across documents.
8
+
9
+ These matrices are foundational for many NLP and Computational Linguistics tasks, including topic modeling, word embedding training, and network analysis. The output is provided as Pandas DataFrames for ease of analysis and integration with data science workflows.
10
+ """
11
+
12
+ from sklearn.feature_extraction.text import CountVectorizer
13
+ import numpy as np
14
+ import pandas as pd
15
+
16
+ from ..lib_types import csr_matrix, DataFrame, ndarray
17
+
18
+ def create_dtm(corpus: list[str], **kwargs) -> DataFrame:
19
+ """
20
+ Creates a Document-Term Matrix (DTM).
21
+
22
+ :param corpus: A list of sentences or collocations from which to build a matrix.
23
+ :type corpus: list[str]
24
+ :param kwargs: Additional keyword arguments to pass to sklearn's CountVectorizer. Common options include:
25
+
26
+ * **stop_words:** If provided, a list of stopwords to remove from the corpus.
27
+ * **ngram_range:** A tuple (min_n, max_n) specifying the range of n-grams to consider.
28
+
29
+ For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
30
+
31
+ :return: A Document Term Matrix (DTM).
32
+ :rtype: DataFrame.
33
+ """
34
+
35
+ vectorizer: CountVectorizer = CountVectorizer(**kwargs)
36
+
37
+ dtm: csr_matrix = vectorizer.fit_transform(corpus)
38
+
39
+ return pd.DataFrame(dtm.toarray(), index=np.array(corpus), columns=vectorizer.get_feature_names_out())
40
+
41
+ def create_co_occurrence_matrix(dtm: DataFrame) -> DataFrame:
42
+ """
43
+ Creates a Co-occurrence matrix from a Document Term Matrix (DTM).
44
+
45
+ :param dtm: A Document Term Matrix (DTM) from which to build a Co-occurrence matrix.
46
+ :type dtm: DataFrame
47
+
48
+ :return: A Co-occurrence matrix.
49
+ :rtype: DataFrame
50
+ """
51
+ matrix: ndarray = dtm.to_numpy()
52
+
53
+ co_matrix: ndarray = matrix.T @ matrix
54
+
55
+ return pd.DataFrame(co_matrix, index=dtm.columns, columns=dtm.columns)
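A minimal usage sketch, assuming the module is importable as dstk.modules.text_matrix_builder:

# Minimal usage sketch; the import path is an assumption based on the file listing.
from dstk.modules import text_matrix_builder as tmb

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs",
]

# Document-Term Matrix: one row per document, one column per term.
dtm = tmb.create_dtm(corpus, stop_words=["the", "and"], ngram_range=(1, 1))
print(dtm)

# Term-term co-occurrence counts, i.e. dtm.T @ dtm aggregated over documents.
co_occurrence = tmb.create_co_occurrence_matrix(dtm)
print(co_occurrence.loc["sat", "cat"])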
dstk/modules/text_processor.py ADDED
@@ -0,0 +1,100 @@
1
+ """
2
+ This module provides utility functions for processing tokenized or lemmatized text represented as lists of strings
3
+ or POS-tagged tuples. It supports common text normalization and transformation tasks, such as lowercasing,
4
+ vocabulary extraction, and joining tokens into a single string. Additionally, it includes functionality for saving
5
+ processed text or tagged data to a file in plain text or CSV format.
6
+
7
+ Core functionalities include:
8
+
9
+ * Converting spaCy tokens to strings (with optional lemmatization)
10
+ * Lowercasing and vocabulary extraction
11
+ * Joining word lists into full text strings
12
+ * Saving word lists or (token, POS) pairs to disk in a consistent format
13
+
14
+ This module is useful for preparing text data for further analysis, modeling, or storage.
15
+ """
16
+
17
+ from pathlib import Path
18
+
19
+ from ..lib_types.dstk_types import Words, POSTaggedWordList, Token, POSTaggedWord
20
+
21
+ def tokens_to_text(tokens: Words[Token], lemmatize: bool = False) -> Words[str]:
22
+ """
23
+ Converts a list of spaCy Token objects to a list of words represented as strings.
24
+
25
+ :param tokens: A list of spaCy tokens.
26
+ :type tokens: Words[Token]
27
+ :param lemmatize: Whether to return the lemmatized form of each token. Defaults to False.
28
+ :type lemmatize: bool
29
+
30
+ :return: A list of words represented as strings.
31
+ :rtype: Words[str]
32
+ """
33
+
34
+ return [token.lemma_.lower() if lemmatize else token.text for token in tokens]
35
+
36
+ def to_lower(words: Words[str]) -> Words[str]:
37
+ """
38
+ Returns a list of lowercased words.
39
+
40
+ :param words: A list of words represented as strings.
41
+ :type words: Words[str]
42
+
43
+ :return: A list of words represented as strings.
44
+ :rtype: Words[str]
45
+ """
46
+
47
+ return [word.lower() for word in words]
48
+
49
+ def get_vocabulary(words: Words[str]) -> Words[str]:
50
+ """
51
+ Returns the vocabulary of a text.
52
+
53
+ :param words: A list of words represented as strings.
54
+ :type words: Words[str]
55
+
56
+ :return: A sorted list of the unique words, represented as strings.
57
+ :rtype: Words[str]
58
+ """
59
+
60
+ return sorted(set(words))
61
+
62
+ def join(words: Words[str]) -> str:
63
+ """
64
+ Joins a list of strings into a single text string.
65
+
66
+ :param words: A list of words represented as strings.
67
+ :type words: Words[str]
68
+
69
+ :return: A single string formed by concatenating the input words separated by spaces.
70
+ :rtype: str
71
+ """
72
+
73
+ return " ".join(words)
74
+
75
+ def save_to_file(words: Words[str] | POSTaggedWordList, path: str) -> str:
76
+ """
77
+ Saves a list of strings or (Token, POS) tuples to the specified path. If words is a list of strings, each string is written on a new line. If it is a list of tuples, each tuple is written on a new line as a pair of values separated by a comma, in CSV format.
78
+
79
+ :param words: A list of words represented as strings or a list of POSTaggedWord tuples.
80
+ :type words: Words[str] or POSTaggedWordList.
81
+ :param path: The path where to save the list of words.
82
+ :type path: str
83
+
84
+ :return: The path where the file was saved.
85
+ :rtype: str
86
+ """
87
+
88
+ with open(path, "w") as file:
89
+ for word in words:
90
+ if isinstance(word, str):
91
+ file.write(word + "\n")
92
+ elif isinstance(word, POSTaggedWord):
93
+ if isinstance(word[0], str):
94
+ file.write(word[0] + "," + word[1] + "\n")
95
+ else:
96
+ raise ValueError("You can only use save_to_file with a POSTaggedWordList whose first element is of type str.")
97
+ else:
98
+ raise ValueError("You can only use save_to_file with Words[str] | POSTaggedWordList")
99
+
100
+ return str(Path(path).resolve())
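A minimal usage sketch operating on plain strings (no spaCy model required), assuming the module is importable as dstk.modules.text_processor:

# Minimal usage sketch; the import path is an assumption based on the file listing.
from dstk.modules import text_processor as tp

words = ["The", "cat", "sat", "on", "the", "mat"]

lowered = tp.to_lower(words)             # ['the', 'cat', 'sat', 'on', 'the', 'mat']
vocabulary = tp.get_vocabulary(lowered)  # sorted unique words
text = tp.join(lowered)                  # 'the cat sat on the mat'

print(vocabulary)
print(text)

# Writes one word per line and returns the resolved output path.
output_path = tp.save_to_file(vocabulary, "vocabulary.txt")
print(output_path)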
dstk/modules/tokenizer.py ADDED
@@ -0,0 +1,139 @@
1
+ """
2
+ This module provides utility functions for tokenizing texts using spaCy.
3
+ It offers tools to process raw text into structured linguistic data, extract tokens and sentences, filter words by specific criteria (e.g., stop words, alphanumeric characters, part-of-speech), and
4
+ generate POS-tagged outputs.
5
+
6
+ Core functionalities include:
7
+
8
+ * Segmenting a text by applying a spaCy language model to raw text
9
+ * Extracting tokens and sentences from processed documents
10
+ * Removing stop words and non-alphanumeric tokens
11
+ * Filtering tokens by part-of-speech (POS) tags
12
+ * Generating (token, POS) tuples for downstream NLP tasks
13
+
14
+ The module is intended to provide tools for text segmentation and tagging.
15
+ """
16
+
17
+ import spacy
18
+
19
+ from ..lib_types.spacy_types import *
20
+ from ..lib_types.dstk_types import Words, POSTaggedWordList, POSTaggedWord, WordSenteces
21
+
22
+ def apply_model(text: str, model: str | Language) -> Doc:
23
+ """
24
+ Takes a text and analyzes it using a language model. It returns a processed version of the text that includes helpful information like the words, their meanings, and how they relate to each other.
25
+
26
+ :param text: The text to be processed.
27
+ :type text: str
28
+ :param model: The name of the model to be used or its instance.
29
+ :type model: str or Language
30
+
31
+ :return: A spaCy Doc object with linguistic annotations.
32
+ :rtype: Doc
33
+ """
34
+
35
+ nlp: Language
36
+
37
+ if isinstance(model, str):
38
+ nlp = spacy.load(model)
39
+ else:
40
+ nlp = model
41
+
42
+ return nlp(text)
43
+
44
+
45
+ def get_tokens(document: Doc) -> Words[Token]:
46
+ """
47
+ Returns a list of spaCy tokens from a Doc object.
48
+
49
+ :param document: A spaCy Doc object.
50
+ :type document: Doc
51
+
52
+ :return: A list of spaCy tokens.
53
+ :rtype: Words[Token]
54
+ """
55
+
56
+ return [token for token in document]
57
+
58
+
59
+ def get_sentences(document: Doc) -> WordSenteces: # Check this one return type
60
+ """
61
+ Returns a list of sentences from a spaCy Doc, where each sentence is represented as a list of spaCy Token objects.
62
+
63
+ :param document: A spaCy Doc object.
64
+ :type document: Doc
65
+
66
+ :return: A list of sentences, each sentence is a list of spaCy Tokens.
67
+ :rtype: WordSentences
68
+ """
69
+
70
+ return [[token for token in sentence] for sentence in document.sents]
71
+
72
+ def remove_stop_words(tokens: Words[Token], custom_stop_words: list[str] | None = None) -> Words[Token]:
73
+ """
74
+ Filters tokens, returning only alphabetic tokens that are not stop words.
75
+
76
+ :param tokens: A list of spaCy tokens.
77
+ :type tokens: Words[Token]
78
+ :param custom_stop_words: If provided, a list of custom stop words. Defaults to None.
79
+ :type custom_stop_words: list[str] or None
80
+
81
+ :return: A list of spaCy tokens.
82
+ :rtype: Words[Token]
83
+ """
84
+
85
+ lower_stop_words: list[str]
86
+
87
+ if custom_stop_words:
88
+ lower_stop_words = [word.lower() for word in custom_stop_words]
89
+
90
+ return [
91
+ token for token in tokens
92
+ if token.is_alpha and not token.is_stop and
93
+ (custom_stop_words is None or token.text.lower() not in lower_stop_words)
94
+ ]
95
+
96
+ def alphanumeric_raw_tokenizer(tokens: Words[Token]) -> Words[Token]:
97
+ """
98
+ Filters tokens, keeping only alphabetic tokens while retaining stop words.
99
+
100
+ :param tokens: A list of spaCy tokens.
101
+ :type tokens: Words[Token]
102
+
103
+ :return: A list of spaCy tokens.
104
+ :rtype: Words[Token]
105
+ """
106
+
107
+ return [
108
+ token
109
+ for token in tokens
110
+ if token.text.isalpha()
111
+ ]
112
+
113
+ def filter_by_pos(tokens: Words[Token], pos: str) -> Words[Token]:
114
+ """
115
+ Returns a list of spaCy tokens filtered by a specific part-of-speech tag.
116
+
117
+ :param tokens: A list of spaCy tokens.
118
+ :type tokens: Words[Token]
119
+ :param pos: The POS tag to filter by (e.g., 'NOUN', 'VERB', etc.). Case-sensitive.
120
+ :type pos: str
121
+
122
+ :return: A list of spaCy tokens.
123
+ :rtype: Words[Token]
124
+ """
125
+
126
+ return [token for token in tokens if token.pos_ == pos]
127
+
128
+ def pos_tagger(tokens: Words[Token]) -> POSTaggedWordList:
129
+ """
130
+ Returns a list of (Token, POS) tuples, pairing each token with its part-of-speech tag.
131
+
132
+ :param tokens: A list of spaCy tokens.
133
+ :type tokens: Words[Token]
134
+
135
+ :return: A list of POSTaggedWord tuples.
136
+ :rtype: POSTaggedWordList
137
+ """
138
+
139
+ return [POSTaggedWord(token, token.pos_) for token in tokens]
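A minimal usage sketch, assuming the module is importable as dstk.modules.tokenizer and that the spaCy model en_core_web_sm is installed (the model name is an assumption, not something this package ships):

# Minimal usage sketch; the import path and spaCy model name are assumptions.
from dstk.modules import tokenizer

doc = tokenizer.apply_model("The quick brown fox jumps over the lazy dog.", "en_core_web_sm")

tokens = tokenizer.get_tokens(doc)
content_tokens = tokenizer.remove_stop_words(tokens)   # alphabetic, non-stop tokens
nouns = tokenizer.filter_by_pos(tokens, "NOUN")
tagged = tokenizer.pos_tagger(content_tokens)          # list of POSTaggedWord tuples

print([token.text for token in content_tokens])
print([token.text for token in nouns])
print(tagged)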