rp-segmentation 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ from rp_segmentation.nltk_resources import ensure_required_nltk_resources
6
+ from rp_segmentation.segmenters import (
7
+ get_tokens,
8
+ n_stop_words_segmentation,
9
+ paragraph_segmentation,
10
+ sentence_segmentation,
11
+ )
12
+
13
# Read the package version from the installed distribution's metadata.
try:
    __version__ = version("rp-segmentation")
except PackageNotFoundError:
    # No installed distribution named "rp-segmentation" was found by
    # importlib.metadata, so fall back to a hard-coded version string.
    __version__ = "0.1.0"


# Names re-exported as the public API of the package.
__all__ = (
    "__version__",
    "ensure_required_nltk_resources",
    "get_tokens",
    "sentence_segmentation",
    "paragraph_segmentation",
    "n_stop_words_segmentation",
)
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+
4
class RPSegmentationError(Exception):
    """
    Root of the rp_segmentation exception hierarchy.

    Catching this type handles every error raised by the package itself
    while letting unrelated built-in Python exceptions propagate.
    """
11
+
12
+
13
class InvalidSegmentationParameterError(RPSegmentationError):
    """
    Signals that a segmentation call received an argument it cannot use.

    Typical causes are an empty language value, an unsupported parameter
    value, a non-positive segmentation threshold, or a text that cannot be
    segmented with the selected strategy.
    """
21
+
22
+
23
class NLTKResourceError(RPSegmentationError):
    """
    Signals that an external NLTK resource is missing or cannot be loaded.

    It is raised when the package cannot find, download, or access a
    resource needed for text processing, such as a tokenizer or a stopword
    corpus.
    """
31
+
32
+
33
# Exception types exported by this module.
__all__ = (
    "RPSegmentationError",
    "InvalidSegmentationParameterError",
    "NLTKResourceError",
)
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Mapping
4
+ from functools import cache
5
+ from types import MappingProxyType
6
+ from typing import Final, NoReturn
7
+ from zipfile import BadZipFile
8
+
9
+ import nltk
10
+
11
+ from rp_segmentation.exceptions import NLTKResourceError
12
+
13
# Read-only registry of the NLTK resources used by the package: each key is
# the name passed to the NLTK downloader, each value the path used to look
# the resource up with ``nltk.data.find``.
NLTK_RESOURCES: Final[Mapping[str, str]] = MappingProxyType(
    dict(
        punkt_tab="tokenizers/punkt_tab",
        stopwords="corpora/stopwords",
    )
)
19
+
20
+
21
def _supported_resources_message() -> str:
    """
    Returns the supported resource names, sorted and separated by commas.
    """
    names = sorted(NLTK_RESOURCES)
    return ", ".join(names)
23
+
24
+
25
+ def _recovery_hint(resource_name: str) -> str:
26
+ return f"Try running: python -m nltk.downloader {resource_name}"
27
+
28
+
29
def _raise_corrupted_resource_error(
    resource_name: str,
    exc: BadZipFile,
    *,
    after_download: bool = False,
) -> NoReturn:
    """
    Raises an NLTKResourceError reporting a corrupted resource archive.

    Parameters
    ----------
    resource_name:
        Name of the NLTK resource whose archive could not be read.
    exc:
        Original ``BadZipFile`` error, chained as the cause.
    after_download:
        Whether the corruption was detected right after a download attempt;
        when True the message says so. Default is False.

    Raises
    ------
    NLTKResourceError
        Always.
    """
    if after_download:
        moment = " after download"
    else:
        moment = ""

    message = (
        f"The NLTK resource appears to be corrupted{moment}: {resource_name}. "
        f"Try deleting the local NLTK resource and running: "
        f"python -m nltk.downloader {resource_name}"
    )

    raise NLTKResourceError(message) from exc
42
+
43
+
44
def _resource_exists(resource_name: str, resource_path: str) -> bool:
    """
    Reports whether an NLTK resource can be located locally.

    Parameters
    ----------
    resource_name:
        Name of the resource; only used in the error message.
    resource_path:
        Path handed to ``nltk.data.find``.

    Returns
    -------
    bool
        True when the lookup succeeds, False when it raises ``LookupError``.

    Raises
    ------
    NLTKResourceError
        If the lookup fails with ``BadZipFile``.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        return False
    except BadZipFile as error:
        _raise_corrupted_resource_error(resource_name, error)
    else:
        return True
54
+
55
+
56
@cache
def ensure_nltk_resource(resource_name: str) -> None:
    """
    Makes sure a required NLTK resource can be used, downloading it if needed.

    The resource is first looked up locally. When it is absent, a quiet
    download is attempted and the lookup is repeated to confirm the result.
    Because the function is wrapped in ``functools.cache``, repeated calls
    with the same name return immediately after the first success.

    Parameters
    ----------
    resource_name:
        Name of the NLTK resource. Example: 'punkt_tab'.

    Raises
    ------
    NLTKResourceError
        If the resource is unsupported, corrupted, cannot be downloaded, or
        cannot be located after the download.
    """
    if resource_name not in NLTK_RESOURCES:
        raise NLTKResourceError(
            f"Unsupported NLTK resource for rp_segmentation: {resource_name}. "
            f"Supported resources are: {_supported_resources_message()}."
        )

    resource_path = NLTK_RESOURCES[resource_name]

    if _resource_exists(resource_name, resource_path):
        return

    try:
        downloaded = nltk.download(resource_name, quiet=True)
    except BadZipFile as error:
        _raise_corrupted_resource_error(resource_name, error, after_download=True)
    except Exception as error:
        raise NLTKResourceError(
            f"Could not download the NLTK resource: {resource_name}. "
            f"{_recovery_hint(resource_name)}"
        ) from error
    else:
        # A falsy result from the downloader is treated as a failed download.
        if not downloaded:
            raise NLTKResourceError(
                f"Could not download the NLTK resource: {resource_name}. "
                f"{_recovery_hint(resource_name)}"
            )

    # Confirm that the freshly downloaded resource can actually be found.
    try:
        nltk.data.find(resource_path)
    except BadZipFile as error:
        _raise_corrupted_resource_error(resource_name, error, after_download=True)
    except Exception as error:
        raise NLTKResourceError(
            f"The NLTK resource was downloaded but could not be located: "
            f"{resource_name}. {_recovery_hint(resource_name)}"
        ) from error
116
+
117
+
118
def ensure_required_nltk_resources() -> None:
    """
    Makes every NLTK resource listed in ``NLTK_RESOURCES`` available.

    Raises
    ------
    NLTKResourceError
        If any of the resources cannot be made available.
    """
    for name in NLTK_RESOURCES:
        ensure_nltk_resource(name)
File without changes
@@ -0,0 +1,395 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import cache
4
+ from typing import cast
5
+
6
+ import regex as re
7
+ from nltk.corpus import stopwords
8
+ from nltk.tokenize import sent_tokenize
9
+
10
+ from rp_segmentation.exceptions import InvalidSegmentationParameterError
11
+ from rp_segmentation.nltk_resources import ensure_nltk_resource
12
+
13
# Matches maximal runs of Unicode letters (\p{L}); digits, punctuation and
# symbols are never part of a match.
WORD_RE = re.compile(r"\p{L}+")
14
+
15
+
16
+ def _normalize_text(text: str) -> str:
17
+ """
18
+ Normalizes leading and trailing whitespace while preserving the internal
19
+ structure of the text.
20
+
21
+ Parameters
22
+ ----------
23
+ text:
24
+ Input text.
25
+
26
+ Returns
27
+ -------
28
+ str
29
+ Normalized text.
30
+
31
+ Raises
32
+ ------
33
+ TypeError
34
+ If the input value is not a string.
35
+ """
36
+ if not isinstance(text, str):
37
+ raise TypeError("The input text must be a string.")
38
+
39
+ return text.strip()
40
+
41
+
42
def _validate_language(language: str) -> str:
    """
    Checks the language parameter and returns its canonical form.

    Parameters
    ----------
    language:
        Language identifier used by NLTK.

    Returns
    -------
    str
        The language with surrounding whitespace removed and in lowercase.

    Raises
    ------
    InvalidSegmentationParameterError
        If the value is not a string or is empty after stripping whitespace.
    """
    if isinstance(language, str):
        stripped = language.strip()

        if stripped:
            return stripped.lower()

    raise InvalidSegmentationParameterError(
        "The language parameter must be a non-empty string."
    )
67
+
68
+
69
@cache
def _get_stopwords(language: str) -> frozenset[str]:
    """
    Returns the NLTK stopwords of a language as an immutable set.

    Results are memoized by ``functools.cache``, keyed on the ``language``
    argument exactly as it is passed in.

    Parameters
    ----------
    language:
        Language used to retrieve NLTK stopwords.

    Returns
    -------
    frozenset[str]
        Stopwords for the selected language.

    Raises
    ------
    InvalidSegmentationParameterError
        If the language value is invalid or no stopwords are available
        for it.
    NLTKResourceError
        If the 'stopwords' resource cannot be made available.
    """
    language = _validate_language(language)

    ensure_nltk_resource("stopwords")

    try:
        words = stopwords.words(language)
    except OSError as error:
        raise InvalidSegmentationParameterError(
            f"Stopwords are not available for language: {language}."
        ) from error

    return frozenset(words)
100
+
101
+
102
def get_tokens(
    text: str,
    language: str = "english",
    remove_stopwords: bool = False,
) -> list[str]:
    """
    Extracts lowercase word tokens from a text.

    The tokenizer keeps Unicode letter characters and discards numbers,
    punctuation marks, symbols, and empty values.

    Parameters
    ----------
    text:
        Input text.
    language:
        Language used when stopword removal is enabled. It is only
        validated when ``remove_stopwords`` is True and the text contains
        something to tokenize. Default is 'english'.
    remove_stopwords:
        Whether to remove stopwords from the resulting tokens.
        Default is False.

    Returns
    -------
    list[str]
        List of lowercase word tokens.

    Raises
    ------
    TypeError
        If ``text`` is not a string.
    InvalidSegmentationParameterError
        If stopword removal is enabled and ``language`` is invalid or has
        no stopwords available.
    NLTKResourceError
        If stopword removal is enabled and the 'stopwords' resource cannot
        be made available.

    Examples
    --------
    >>> get_tokens("Hello, John. How are you?")
    ['hello', 'john', 'how', 'are', 'you']

    >>> get_tokens("Hello, John. How are you?", remove_stopwords=True)
    ['hello', 'john']
    """
    clean_text = _normalize_text(text)

    if not clean_text:
        return []

    tokens = cast(list[str], WORD_RE.findall(clean_text.lower()))

    if not remove_stopwords:
        return tokens

    # Normalize the language before the cached lookup: an invalid value is
    # then reported as InvalidSegmentationParameterError instead of failing
    # while the cache hashes its argument, and spelling variants such as
    # "English" and " english " share a single cache entry.
    stop_words = _get_stopwords(_validate_language(language))

    return [token for token in tokens if token not in stop_words]
150
+
151
+
152
def _clean_segment(
    text: str,
    language: str = "english",
    remove_stopwords: bool = False,
) -> str:
    """
    Reduces a segment to its lowercase word tokens joined by single spaces.

    Parameters
    ----------
    text:
        Input sentence or paragraph.
    language:
        Language used when stopword removal is enabled.
        Default is 'english'.
    remove_stopwords:
        Whether to remove stopwords from the resulting segment.
        Default is False.

    Returns
    -------
    str
        Normalized segment; empty when no token is left.

    Examples
    --------
    >>> _clean_segment("Hello, John. How are you?")
    'hello john how are you'

    >>> _clean_segment("Hello, John. How are you?", remove_stopwords=True)
    'hello john'
    """
    tokens = get_tokens(text, language=language, remove_stopwords=remove_stopwords)

    return " ".join(tokens)
192
+
193
+
194
def sentence_segmentation(
    text: str,
    language: str = "english",
    remove_stopwords: bool = False,
) -> list[str]:
    """
    Segments a text into sentences and normalizes each resulting segment.

    The function validates and normalizes the input text, ensures that the
    required NLTK tokenizer resource is available, detects sentence boundaries
    using the selected language, and discards empty segments after
    normalization.

    Parameters
    ----------
    text:
        Input text to be segmented.
    language:
        Language used by NLTK for sentence boundary detection.
        Default is 'english'.
    remove_stopwords:
        Whether to remove stopwords from each normalized sentence.
        Default is False.

    Returns
    -------
    list[str]
        List of normalized sentence segments. Empty when the text is empty
        or contains only whitespace.

    Raises
    ------
    TypeError
        If ``text`` is not a string.
    InvalidSegmentationParameterError
        If ``language`` is not a non-empty string, or stopword removal is
        enabled and no stopwords are available for the language.
    NLTKResourceError
        If a required NLTK resource cannot be made available.

    Examples
    --------
    >>> sentence_segmentation("Hello, John. How are you?")
    ['hello john', 'how are you']

    >>> sentence_segmentation("Hello, John. How are you?", remove_stopwords=True)
    ['hello john']
    """
    language = _validate_language(language)

    clean_text = _normalize_text(text)

    if not clean_text:
        return []

    ensure_nltk_resource("punkt_tab")

    sentences = sent_tokenize(clean_text, language=language)

    # Sentences that normalize to an empty string (for example, only
    # punctuation or only stopwords) are left out of the result.
    return [
        cleaned_sentence
        for sentence in sentences
        if (
            cleaned_sentence := _clean_segment(
                sentence,
                language=language,
                remove_stopwords=remove_stopwords,
            )
        )
    ]
253
+
254
+
255
def paragraph_segmentation(
    text: str,
    language: str = "english",
    remove_stopwords: bool = False,
) -> list[str]:
    """
    Segments a text into paragraphs and normalizes each resulting segment.

    Paragraph boundaries are detected using double or multiple line breaks.
    Each paragraph is normalized, and empty segments produced after
    normalization are discarded.

    Parameters
    ----------
    text:
        Input text to be segmented.
    language:
        Language used when stopword removal is enabled.
        Default is 'english'.
    remove_stopwords:
        Whether to remove stopwords from each normalized paragraph.
        Default is False.

    Returns
    -------
    list[str]
        List of normalized paragraph segments. Empty when the text is empty
        or contains only whitespace.

    Raises
    ------
    TypeError
        If ``text`` is not a string.
    InvalidSegmentationParameterError
        If ``language`` is not a non-empty string, or stopword removal is
        enabled and no stopwords are available for the language.
    NLTKResourceError
        If stopword removal is enabled and the 'stopwords' resource cannot
        be made available.

    Examples
    --------
    >>> paragraph_segmentation("First paragraph.\\n\\nSecond paragraph.")
    ['first paragraph', 'second paragraph']

    >>> paragraph_segmentation(
    ...     "This is the first paragraph.\\n\\nThis is another one.",
    ...     remove_stopwords=True,
    ... )
    ['first paragraph', 'another one']
    """
    language = _validate_language(language)

    clean_text = _normalize_text(text)

    if not clean_text:
        return []

    # A boundary is a line break followed by optional whitespace and at
    # least one more line break, so whitespace-only lines also separate
    # paragraphs.
    paragraphs = re.split(r"\n\s*\n+", clean_text)

    return [
        cleaned_paragraph
        for paragraph in paragraphs
        if (
            cleaned_paragraph := _clean_segment(
                paragraph,
                language=language,
                remove_stopwords=remove_stopwords,
            )
        )
    ]
314
+
315
+
316
def n_stop_words_segmentation(
    text: str, language: str = "english", n: int = 5, remove_stopwords: bool = False
) -> list[str]:
    """
    Segments a text by closing a segment after every N stopwords.

    Tokens are accumulated in reading order. As soon as N stopwords have
    been seen, the accumulated tokens are emitted as one segment and the
    count restarts. Tokens left after the last boundary form a final segment.

    Parameters
    ----------
    text:
        Input text to be segmented.
    language:
        Language of the stopword list used to detect segment boundaries
        and, when ``remove_stopwords`` is True, to filter the tokens.
        Default is 'english'.
    n:
        Number of stopwords after which a segment is closed. Must be
        greater than zero. Default is 5.
    remove_stopwords:
        Whether to remove stopwords from each segment. Removed stopwords
        still count towards ``n``. Default is False.

    Returns
    -------
    list[str]
        List of segments, each made of lowercase tokens joined by single
        spaces. Empty when the text is empty or contains only whitespace.

    Raises
    ------
    TypeError
        If ``text`` is not a string.
    InvalidSegmentationParameterError
        If ``language`` is invalid, ``n`` is not greater than zero, the text
        contains no word tokens, or fewer than 20% of its tokens are
        stopwords.
    NLTKResourceError
        If the 'stopwords' resource cannot be made available.

    Examples
    --------
    >>> n_stop_words_segmentation("Alpha the beta and gamma is delta of omega.", n=2)
    ['alpha the beta and', 'gamma is delta of', 'omega']
    """
    language = _validate_language(language)

    clean_text = _normalize_text(text)

    # Validate n before the empty-text shortcut so that an invalid value is
    # reported no matter which text is being processed.
    if n <= 0:
        raise InvalidSegmentationParameterError(
            "The number of stopwords must be greater than zero."
        )

    if not clean_text:
        return []

    tokens = get_tokens(clean_text)

    if not tokens:
        raise InvalidSegmentationParameterError(
            "The number of tokens must be greater than zero."
        )

    stop_words = _get_stopwords(language)

    # At least this share of the tokens must be stopwords; below it the text
    # is rejected as not segmentable with this strategy.
    min_stopword_ratio = 0.2

    stopword_count = sum(1 for token in tokens if token in stop_words)

    if stopword_count / len(tokens) < min_stopword_ratio:
        raise InvalidSegmentationParameterError(
            "The text does not contain enough stopwords to be segmented."
        )

    segments: list[str] = []
    current_tokens: list[str] = []
    current_stopword_count = 0

    for token in tokens:
        is_stopword = token in stop_words

        if not (remove_stopwords and is_stopword):
            current_tokens.append(token)

        if not is_stopword:
            continue

        current_stopword_count += 1

        if current_stopword_count == n:
            # With stopword removal enabled a run of N stopwords can leave
            # nothing to emit; such empty segments are skipped.
            if current_tokens:
                segments.append(" ".join(current_tokens))

            current_tokens = []
            current_stopword_count = 0

    if current_tokens:
        segments.append(" ".join(current_tokens))

    return segments
@@ -0,0 +1,341 @@
1
+ Metadata-Version: 2.4
2
+ Name: rp-segmentation
3
+ Version: 0.1.0
4
+ Summary: A lightweight text segmentation and tokenization library for Python.
5
+ Author-email: Pablo Nicolás Ramos <pablonicolasramos.90@gmail.com>, Ricardo Daniel Perez <sanexto@gmail.com>
6
+ Maintainer-email: Pablo Nicolás Ramos <pablonicolasramos.90@gmail.com>, Ricardo Daniel Perez <sanexto@gmail.com>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/pablonicolasr/rp_segmentation
9
+ Project-URL: Repository, https://github.com/pablonicolasr/rp_segmentation
10
+ Project-URL: Issues, https://github.com/pablonicolasr/rp_segmentation/issues
11
+ Project-URL: Changelog, https://github.com/pablonicolasr/rp_segmentation/blob/main/CHANGELOG.md
12
+ Keywords: nlp,nltk,tokenization,text-processing,text-segmentation,natural-language-processing
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Text Processing
23
+ Classifier: Topic :: Text Processing :: Linguistic
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.10
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: nltk<4.0,>=3.9
29
+ Requires-Dist: regex>=2024.0.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: build>=1.2.0; extra == "dev"
32
+ Requires-Dist: mypy>=1.10.0; extra == "dev"
33
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
34
+ Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
35
+ Requires-Dist: ruff>=0.5.0; extra == "dev"
36
+ Requires-Dist: twine>=5.0.0; extra == "dev"
37
+ Requires-Dist: types-regex>=2024.0.0; extra == "dev"
38
+ Provides-Extra: docs
39
+ Requires-Dist: mkdocs<2.0.0,>=1.6.0; extra == "docs"
40
+ Requires-Dist: mkdocs-material<10.0.0,>=9.5.0; extra == "docs"
41
+ Dynamic: license-file
42
+
43
+ # rp-segmentation
44
+
45
+ `rp-segmentation` is a lightweight Python library for text segmentation, token normalization, and NLP-oriented preprocessing.
46
+
47
+ The package provides a simple and consistent API for splitting text into meaningful units, including sentences, paragraphs, and stopword-based segments. It is designed for text processing pipelines, NLP experimentation, semantic search, retrieval-augmented generation, and document preprocessing workflows.
48
+
49
+ ## Features
50
+
51
+ * Sentence segmentation using NLTK.
52
+ * Paragraph segmentation based on structural line breaks.
53
+ * Stopword-based segmentation every `N` stopwords.
54
+ * Unicode-aware token extraction.
55
+ * Optional stopword removal.
56
+ * Typed package support through `py.typed`.
57
+ * Lightweight and easy to integrate into NLP pipelines.
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ pip install rp-segmentation
63
+ ```
64
+
65
+ ## Requirements
66
+
67
+ * Python 3.10 or higher.
68
+ * NLTK.
69
+ * regex.
70
+
71
+ ## NLTK Resources
72
+
73
+ `rp-segmentation` relies on external NLTK resources for sentence tokenization and stopword handling.
74
+
75
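+ Missing resources are downloaded automatically the first time they are needed. You can also install them ahead of time (for example, for offline or CI environments):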
76
+
77
+ ```bash
78
+ python -m nltk.downloader punkt_tab
79
+ python -m nltk.downloader stopwords
80
+ ```
81
+
82
+ Or install them directly from Python:
83
+
84
+ ```python
85
+ from rp_segmentation import ensure_required_nltk_resources
86
+
87
+ ensure_required_nltk_resources()
88
+ ```
89
+
90
+ ## Basic Usage
91
+
92
+ ```python
93
+ from rp_segmentation import (
94
+ sentence_segmentation,
95
+ paragraph_segmentation,
96
+ n_stop_words_segmentation,
97
+ )
98
+
99
+ text = """
100
+ Hello, Pablo. This is a simple test.
101
+
102
+ This is another paragraph with additional content.
103
+ It can be used for text processing workflows.
104
+ """
105
+
106
+ print(sentence_segmentation(text))
107
+ print(paragraph_segmentation(text))
108
+ print(n_stop_words_segmentation(text, n=3))
109
+ ```
110
+
111
+ ## Available Methods
112
+
113
+ ### `sentence_segmentation`
114
+
115
+ ```python
116
+ sentence_segmentation(
117
+ text: str,
118
+ language: str = "english",
119
+ remove_stopwords: bool = False,
120
+ ) -> list[str]
121
+ ```
122
+
123
+ Segments a text into sentences using NLTK and applies the package's internal normalization strategy to each resulting segment.
124
+
125
+ #### Example
126
+
127
+ ```python
128
+ from rp_segmentation import sentence_segmentation
129
+
130
+ text = "Hello, John. How are you?"
131
+
132
+ segments = sentence_segmentation(text)
133
+
134
+ print(segments)
135
+ ```
136
+
137
+ Output:
138
+
139
+ ```python
140
+ ["hello john", "how are you"]
141
+ ```
142
+
143
+ With stopword removal:
144
+
145
+ ```python
146
+ segments = sentence_segmentation(
147
+ text,
148
+ language="english",
149
+ remove_stopwords=True,
150
+ )
151
+
152
+ print(segments)
153
+ ```
154
+
155
+ Output:
156
+
157
+ ```python
158
+ ["hello john"]
159
+ ```
160
+
161
+ ---
162
+
163
+ ### `paragraph_segmentation`
164
+
165
+ ```python
166
+ paragraph_segmentation(
167
+ text: str,
168
+ language: str = "english",
169
+ remove_stopwords: bool = False,
170
+ ) -> list[str]
171
+ ```
172
+
173
+ Segments a text into paragraphs using double or multiple line breaks. Each paragraph is normalized before being returned.
174
+
175
+ #### Example
176
+
177
+ ```python
178
+ from rp_segmentation import paragraph_segmentation
179
+
180
+ text = "First paragraph.\n\nSecond paragraph."
181
+
182
+ segments = paragraph_segmentation(text)
183
+
184
+ print(segments)
185
+ ```
186
+
187
+ Output:
188
+
189
+ ```python
190
+ ["first paragraph", "second paragraph"]
191
+ ```
192
+
193
+ ---
194
+
195
+ ### `n_stop_words_segmentation`
196
+
197
+ ```python
198
+ n_stop_words_segmentation(
199
+ text: str,
200
+ language: str = "english",
201
+ n: int = 5,
202
+ remove_stopwords: bool = False,
203
+ ) -> list[str]
204
+ ```
205
+
206
+ Segments a text every `N` stopwords. This strategy is useful when working with natural language texts where stopword distribution can help define semantic or syntactic boundaries.
207
+
208
+ #### Example
209
+
210
+ ```python
211
+ from rp_segmentation import n_stop_words_segmentation
212
+
213
+ text = "Alpha the beta and gamma is delta of omega."
214
+
215
+ segments = n_stop_words_segmentation(
216
+ text,
217
+ language="english",
218
+ n=2,
219
+ )
220
+
221
+ print(segments)
222
+ ```
223
+
224
+ Output:
225
+
226
+ ```python
227
+ [
228
+ "alpha the beta and",
229
+ "gamma is delta of",
230
+ "omega",
231
+ ]
232
+ ```
233
+
234
+ With stopword removal:
235
+
236
+ ```python
237
+ segments = n_stop_words_segmentation(
238
+ text,
239
+ language="english",
240
+ n=2,
241
+ remove_stopwords=True,
242
+ )
243
+
244
+ print(segments)
245
+ ```
246
+
247
+ Output:
248
+
249
+ ```python
250
+ [
251
+ "alpha beta",
252
+ "gamma delta",
253
+ "omega",
254
+ ]
255
+ ```
256
+
257
+ ## Use Cases
258
+
259
+ `rp-segmentation` can be used in a wide range of text processing tasks, including:
260
+
261
+ * Natural Language Processing.
262
+ * Text normalization.
263
+ * Document preprocessing.
264
+ * Semantic search.
265
+ * Embedding preparation.
266
+ * Retrieval-Augmented Generation pipelines.
267
+ * Educational and research-oriented NLP projects.
268
+
269
+ ## Local Development
270
+
271
+ Clone the repository:
272
+
273
+ ```bash
274
+ git clone https://github.com/pablonicolasr777/rp-segmentation.git
275
+ cd rp-segmentation
276
+ ```
277
+
278
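+ Create and activate a virtual environment (the activation command below is for Windows PowerShell; on Linux or macOS use `source .venv/bin/activate` instead):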
279
+
280
+ ```bash
281
+ python -m venv .venv
282
+ .venv\Scripts\Activate.ps1
283
+ ```
284
+
285
+ Install the package with development dependencies:
286
+
287
+ ```bash
288
+ pip install -e ".[dev]"
289
+ ```
290
+
291
+ Install the required NLTK resources:
292
+
293
+ ```bash
294
+ python -m nltk.downloader punkt_tab
295
+ python -m nltk.downloader stopwords
296
+ ```
297
+
298
+ Run code quality checks:
299
+
300
+ ```bash
301
+ ruff check .
302
+ mypy src
303
+ pytest --cov=rp_segmentation --cov-report=term-missing
304
+ ```
305
+
306
+ ## Project Structure
307
+
308
+ ```text
309
+ rp-segmentation/
310
+ ├── src/
311
+ │ └── rp_segmentation/
312
+ │ ├── __init__.py
313
+ │ ├── segmenters.py
314
+ │ ├── nltk_resources.py
315
+ │ ├── exceptions.py
316
+ │ └── py.typed
317
+ ├── tests/
318
+ │ └── test_segmenters.py
319
+ ├── docs/
320
+ ├── .github/
321
+ │ └── workflows/
322
+ │ ├── ci.yml
323
+ │ └── publish.yml
324
+ ├── README.md
325
+ ├── CHANGELOG.md
326
+ ├── CONTRIBUTING.md
327
+ ├── SECURITY.md
328
+ ├── LICENSE
329
+ ├── pyproject.toml
330
+ ├── requirements-dev.txt
331
+ └── .gitignore
332
+ ```
333
+
334
+ ## Authors
335
+
336
+ * Pablo Nicolás Ramos
337
+ * Ricardo Daniel Perez
338
+
339
+ ## License
340
+
341
+ This project is licensed under the MIT License.
@@ -0,0 +1,10 @@
1
+ rp_segmentation/__init__.py,sha256=axOooewzkbk6ApbBJXE1mLXbb6fuIxD-zwD4Iccym3A,606
2
+ rp_segmentation/exceptions.py,sha256=sCdXAT8zulINoIK-gaipzcCPbCPXMCaz7podmkgFtVs,1045
3
+ rp_segmentation/nltk_resources.py,sha256=VuHm8n7UMmLmxmhwC8Pxa_0g_h5-OdXfOHqTRBPCNTY,3476
4
+ rp_segmentation/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ rp_segmentation/segmenters.py,sha256=T7zhwIarx0Y6qkYnd_PYiiwetdvikWxWhT6gUV7VTrY,9454
6
+ rp_segmentation-0.1.0.dist-info/licenses/LICENSE,sha256=RRGZoHMerRt3n08cFCuM281l7NzGNkcKwl6q634zugI,1097
7
+ rp_segmentation-0.1.0.dist-info/METADATA,sha256=GCgj8-gZDrkMNnpTZ85Hah-u2Xnk4mS91fw2JlFIoRU,7587
8
+ rp_segmentation-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
9
+ rp_segmentation-0.1.0.dist-info/top_level.txt,sha256=LZbx_li_bxf9E4N6nRyTN7HDuFNRnrFuXIYRFt1wG0I,16
10
+ rp_segmentation-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pablo Nicolás Ramos and Ricardo Daniel Perez
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ rp_segmentation