rp-segmentation 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rp_segmentation-0.1.0/LICENSE +21 -0
- rp_segmentation-0.1.0/PKG-INFO +341 -0
- rp_segmentation-0.1.0/README.md +299 -0
- rp_segmentation-0.1.0/pyproject.toml +120 -0
- rp_segmentation-0.1.0/setup.cfg +4 -0
- rp_segmentation-0.1.0/src/rp_segmentation/__init__.py +26 -0
- rp_segmentation-0.1.0/src/rp_segmentation/exceptions.py +37 -0
- rp_segmentation-0.1.0/src/rp_segmentation/nltk_resources.py +123 -0
- rp_segmentation-0.1.0/src/rp_segmentation/py.typed +0 -0
- rp_segmentation-0.1.0/src/rp_segmentation/segmenters.py +395 -0
- rp_segmentation-0.1.0/src/rp_segmentation.egg-info/PKG-INFO +341 -0
- rp_segmentation-0.1.0/src/rp_segmentation.egg-info/SOURCES.txt +16 -0
- rp_segmentation-0.1.0/src/rp_segmentation.egg-info/dependency_links.txt +1 -0
- rp_segmentation-0.1.0/src/rp_segmentation.egg-info/requires.txt +15 -0
- rp_segmentation-0.1.0/src/rp_segmentation.egg-info/top_level.txt +1 -0
- rp_segmentation-0.1.0/tests/test_init.py +36 -0
- rp_segmentation-0.1.0/tests/test_nltk_resources.py +260 -0
- rp_segmentation-0.1.0/tests/test_segmenters.py +811 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pablo Nicolás Ramos and Ricardo Daniel Perez
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rp-segmentation
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight text segmentation and tokenization library for Python.
|
|
5
|
+
Author-email: Pablo Nicolás Ramos <pablonicolasramos.90@gmail.com>, Ricardo Daniel Perez <sanexto@gmail.com>
|
|
6
|
+
Maintainer-email: Pablo Nicolás Ramos <pablonicolasramos.90@gmail.com>, Ricardo Daniel Perez <sanexto@gmail.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/pablonicolasr/rp_segmentation
|
|
9
|
+
Project-URL: Repository, https://github.com/pablonicolasr/rp_segmentation
|
|
10
|
+
Project-URL: Issues, https://github.com/pablonicolasr/rp_segmentation/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/pablonicolasr/rp_segmentation/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: nlp,nltk,tokenization,text-processing,text-segmentation,natural-language-processing
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Education
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing
|
|
23
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.10
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Requires-Dist: nltk<4.0,>=3.9
|
|
29
|
+
Requires-Dist: regex>=2024.0.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: build>=1.2.0; extra == "dev"
|
|
32
|
+
Requires-Dist: mypy>=1.10.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: ruff>=0.5.0; extra == "dev"
|
|
36
|
+
Requires-Dist: twine>=5.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: types-regex>=2024.0.0; extra == "dev"
|
|
38
|
+
Provides-Extra: docs
|
|
39
|
+
Requires-Dist: mkdocs<2.0.0,>=1.6.0; extra == "docs"
|
|
40
|
+
Requires-Dist: mkdocs-material<10.0.0,>=9.5.0; extra == "docs"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# rp-segmentation
|
|
44
|
+
|
|
45
|
+
`rp-segmentation` is a lightweight Python library for text segmentation, token normalization, and NLP-oriented preprocessing.
|
|
46
|
+
|
|
47
|
+
The package provides a simple and consistent API for splitting text into meaningful units, including sentences, paragraphs, and stopword-based segments. It is designed for text processing pipelines, NLP experimentation, semantic search, retrieval-augmented generation, and document preprocessing workflows.
|
|
48
|
+
|
|
49
|
+
## Features
|
|
50
|
+
|
|
51
|
+
* Sentence segmentation using NLTK.
|
|
52
|
+
* Paragraph segmentation based on structural line breaks.
|
|
53
|
+
* Stopword-based segmentation every `N` stopwords.
|
|
54
|
+
* Unicode-aware token extraction.
|
|
55
|
+
* Optional stopword removal.
|
|
56
|
+
* Typed package support through `py.typed`.
|
|
57
|
+
* Lightweight and easy to integrate into NLP pipelines.
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install rp-segmentation
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Requirements
|
|
66
|
+
|
|
67
|
+
* Python 3.10 or higher.
|
|
68
|
+
* NLTK.
|
|
69
|
+
* regex.
|
|
70
|
+
|
|
71
|
+
## NLTK Resources
|
|
72
|
+
|
|
73
|
+
`rp-segmentation` relies on external NLTK resources for sentence tokenization and stopword handling.
|
|
74
|
+
|
|
75
|
+
You can install the required resources manually:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
python -m nltk.downloader punkt_tab
|
|
79
|
+
python -m nltk.downloader stopwords
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Or install them directly from Python:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from rp_segmentation import ensure_required_nltk_resources
|
|
86
|
+
|
|
87
|
+
ensure_required_nltk_resources()
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Basic Usage
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from rp_segmentation import (
|
|
94
|
+
sentence_segmentation,
|
|
95
|
+
paragraph_segmentation,
|
|
96
|
+
n_stop_words_segmentation,
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
text = """
|
|
100
|
+
Hello, Pablo. This is a simple test.
|
|
101
|
+
|
|
102
|
+
This is another paragraph with additional content.
|
|
103
|
+
It can be used for text processing workflows.
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
print(sentence_segmentation(text))
|
|
107
|
+
print(paragraph_segmentation(text))
|
|
108
|
+
print(n_stop_words_segmentation(text, n=3))
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Available Methods
|
|
112
|
+
|
|
113
|
+
### `sentence_segmentation`
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
sentence_segmentation(
|
|
117
|
+
text: str,
|
|
118
|
+
language: str = "english",
|
|
119
|
+
remove_stopwords: bool = False,
|
|
120
|
+
) -> list[str]
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Segments a text into sentences using NLTK and applies the package's internal normalization strategy to each resulting segment.
|
|
124
|
+
|
|
125
|
+
#### Example
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from rp_segmentation import sentence_segmentation
|
|
129
|
+
|
|
130
|
+
text = "Hello, John. How are you?"
|
|
131
|
+
|
|
132
|
+
segments = sentence_segmentation(text)
|
|
133
|
+
|
|
134
|
+
print(segments)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Output:
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
["hello john", "how are you"]
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
With stopword removal:
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
segments = sentence_segmentation(
|
|
147
|
+
text,
|
|
148
|
+
language="english",
|
|
149
|
+
remove_stopwords=True,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
print(segments)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Output:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
["hello john"]
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
### `paragraph_segmentation`
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
paragraph_segmentation(
|
|
167
|
+
text: str,
|
|
168
|
+
language: str = "english",
|
|
169
|
+
remove_stopwords: bool = False,
|
|
170
|
+
) -> list[str]
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Segments a text into paragraphs using double or multiple line breaks. Each paragraph is normalized before being returned.
|
|
174
|
+
|
|
175
|
+
#### Example
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
from rp_segmentation import paragraph_segmentation
|
|
179
|
+
|
|
180
|
+
text = "First paragraph.\n\nSecond paragraph."
|
|
181
|
+
|
|
182
|
+
segments = paragraph_segmentation(text)
|
|
183
|
+
|
|
184
|
+
print(segments)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Output:
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
["first paragraph", "second paragraph"]
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
### `n_stop_words_segmentation`
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
n_stop_words_segmentation(
|
|
199
|
+
text: str,
|
|
200
|
+
language: str = "english",
|
|
201
|
+
n: int = 5,
|
|
202
|
+
remove_stopwords: bool = False,
|
|
203
|
+
) -> list[str]
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
Splits a text into segments, closing each segment right after its `N`-th stopword; any remaining tokens form the final segment. This strategy is useful when working with natural language texts where stopword distribution can help define semantic or syntactic boundaries.
|
|
207
|
+
|
|
208
|
+
#### Example
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
from rp_segmentation import n_stop_words_segmentation
|
|
212
|
+
|
|
213
|
+
text = "Alpha the beta and gamma is delta of omega."
|
|
214
|
+
|
|
215
|
+
segments = n_stop_words_segmentation(
|
|
216
|
+
text,
|
|
217
|
+
language="english",
|
|
218
|
+
n=2,
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
print(segments)
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
Output:
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
[
|
|
228
|
+
"alpha the beta and",
|
|
229
|
+
"gamma is delta of",
|
|
230
|
+
"omega",
|
|
231
|
+
]
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
With stopword removal:
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
segments = n_stop_words_segmentation(
|
|
238
|
+
text,
|
|
239
|
+
language="english",
|
|
240
|
+
n=2,
|
|
241
|
+
remove_stopwords=True,
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
print(segments)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
Output:
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
[
|
|
251
|
+
"alpha beta",
|
|
252
|
+
"gamma delta",
|
|
253
|
+
"omega",
|
|
254
|
+
]
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## Use Cases
|
|
258
|
+
|
|
259
|
+
`rp-segmentation` can be used in a wide range of text processing tasks, including:
|
|
260
|
+
|
|
261
|
+
* Natural Language Processing.
|
|
262
|
+
* Text normalization.
|
|
263
|
+
* Document preprocessing.
|
|
264
|
+
* Semantic search.
|
|
265
|
+
* Embedding preparation.
|
|
266
|
+
* Retrieval-Augmented Generation pipelines.
|
|
267
|
+
* Educational and research-oriented NLP projects.
|
|
268
|
+
|
|
269
|
+
## Local Development
|
|
270
|
+
|
|
271
|
+
Clone the repository:
|
|
272
|
+
|
|
273
|
+
```bash
|
|
274
|
+
git clone https://github.com/pablonicolasr/rp_segmentation.git
|
|
275
|
+
cd rp_segmentation
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
Create and activate a virtual environment:
|
|
279
|
+
|
|
280
|
+
```powershell
|
|
281
|
+
python -m venv .venv
|
|
282
|
+
.venv\Scripts\Activate.ps1
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
Install the package with development dependencies:
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
pip install -e ".[dev]"
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
Install the required NLTK resources:
|
|
292
|
+
|
|
293
|
+
```bash
|
|
294
|
+
python -m nltk.downloader punkt_tab
|
|
295
|
+
python -m nltk.downloader stopwords
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
Run code quality checks:
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
ruff check .
|
|
302
|
+
mypy src
|
|
303
|
+
pytest --cov=rp_segmentation --cov-report=term-missing
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
## Project Structure
|
|
307
|
+
|
|
308
|
+
```text
|
|
309
|
+
rp-segmentation/
|
|
310
|
+
├── src/
|
|
311
|
+
│ └── rp_segmentation/
|
|
312
|
+
│ ├── __init__.py
|
|
313
|
+
│ ├── segmenters.py
|
|
314
|
+
│ ├── nltk_resources.py
|
|
315
|
+
│ ├── exceptions.py
|
|
316
|
+
│ └── py.typed
|
|
317
|
+
├── tests/
|
|
318
|
+
│ └── test_segmenters.py
|
|
319
|
+
├── docs/
|
|
320
|
+
├── .github/
|
|
321
|
+
│ └── workflows/
|
|
322
|
+
│ ├── ci.yml
|
|
323
|
+
│ └── publish.yml
|
|
324
|
+
├── README.md
|
|
325
|
+
├── CHANGELOG.md
|
|
326
|
+
├── CONTRIBUTING.md
|
|
327
|
+
├── SECURITY.md
|
|
328
|
+
├── LICENSE
|
|
329
|
+
├── pyproject.toml
|
|
330
|
+
├── requirements-dev.txt
|
|
331
|
+
└── .gitignore
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
## Authors
|
|
335
|
+
|
|
336
|
+
* Pablo Nicolás Ramos
|
|
337
|
+
* Ricardo Daniel Perez
|
|
338
|
+
|
|
339
|
+
## License
|
|
340
|
+
|
|
341
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# rp-segmentation
|
|
2
|
+
|
|
3
|
+
`rp-segmentation` is a lightweight Python library for text segmentation, token normalization, and NLP-oriented preprocessing.
|
|
4
|
+
|
|
5
|
+
The package provides a simple and consistent API for splitting text into meaningful units, including sentences, paragraphs, and stopword-based segments. It is designed for text processing pipelines, NLP experimentation, semantic search, retrieval-augmented generation, and document preprocessing workflows.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
* Sentence segmentation using NLTK.
|
|
10
|
+
* Paragraph segmentation based on structural line breaks.
|
|
11
|
+
* Stopword-based segmentation every `N` stopwords.
|
|
12
|
+
* Unicode-aware token extraction.
|
|
13
|
+
* Optional stopword removal.
|
|
14
|
+
* Typed package support through `py.typed`.
|
|
15
|
+
* Lightweight and easy to integrate into NLP pipelines.
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install rp-segmentation
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Requirements
|
|
24
|
+
|
|
25
|
+
* Python 3.10 or higher.
|
|
26
|
+
* NLTK.
|
|
27
|
+
* regex.
|
|
28
|
+
|
|
29
|
+
## NLTK Resources
|
|
30
|
+
|
|
31
|
+
`rp-segmentation` relies on external NLTK resources for sentence tokenization and stopword handling.
|
|
32
|
+
|
|
33
|
+
You can install the required resources manually:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
python -m nltk.downloader punkt_tab
|
|
37
|
+
python -m nltk.downloader stopwords
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Or install them directly from Python:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from rp_segmentation import ensure_required_nltk_resources
|
|
44
|
+
|
|
45
|
+
ensure_required_nltk_resources()
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Basic Usage
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from rp_segmentation import (
|
|
52
|
+
sentence_segmentation,
|
|
53
|
+
paragraph_segmentation,
|
|
54
|
+
n_stop_words_segmentation,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
text = """
|
|
58
|
+
Hello, Pablo. This is a simple test.
|
|
59
|
+
|
|
60
|
+
This is another paragraph with additional content.
|
|
61
|
+
It can be used for text processing workflows.
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
print(sentence_segmentation(text))
|
|
65
|
+
print(paragraph_segmentation(text))
|
|
66
|
+
print(n_stop_words_segmentation(text, n=3))
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Available Methods
|
|
70
|
+
|
|
71
|
+
### `sentence_segmentation`
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
sentence_segmentation(
|
|
75
|
+
text: str,
|
|
76
|
+
language: str = "english",
|
|
77
|
+
remove_stopwords: bool = False,
|
|
78
|
+
) -> list[str]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Segments a text into sentences using NLTK and applies the package's internal normalization strategy to each resulting segment.
|
|
82
|
+
|
|
83
|
+
#### Example
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from rp_segmentation import sentence_segmentation
|
|
87
|
+
|
|
88
|
+
text = "Hello, John. How are you?"
|
|
89
|
+
|
|
90
|
+
segments = sentence_segmentation(text)
|
|
91
|
+
|
|
92
|
+
print(segments)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Output:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
["hello john", "how are you"]
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
With stopword removal:
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
segments = sentence_segmentation(
|
|
105
|
+
text,
|
|
106
|
+
language="english",
|
|
107
|
+
remove_stopwords=True,
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
print(segments)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Output:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
["hello john"]
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
### `paragraph_segmentation`
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
paragraph_segmentation(
|
|
125
|
+
text: str,
|
|
126
|
+
language: str = "english",
|
|
127
|
+
remove_stopwords: bool = False,
|
|
128
|
+
) -> list[str]
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Segments a text into paragraphs using double or multiple line breaks. Each paragraph is normalized before being returned.
|
|
132
|
+
|
|
133
|
+
#### Example
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
from rp_segmentation import paragraph_segmentation
|
|
137
|
+
|
|
138
|
+
text = "First paragraph.\n\nSecond paragraph."
|
|
139
|
+
|
|
140
|
+
segments = paragraph_segmentation(text)
|
|
141
|
+
|
|
142
|
+
print(segments)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Output:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
["first paragraph", "second paragraph"]
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
### `n_stop_words_segmentation`
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
n_stop_words_segmentation(
|
|
157
|
+
text: str,
|
|
158
|
+
language: str = "english",
|
|
159
|
+
n: int = 5,
|
|
160
|
+
remove_stopwords: bool = False,
|
|
161
|
+
) -> list[str]
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Splits a text into segments, closing each segment right after its `N`-th stopword; any remaining tokens form the final segment. This strategy is useful when working with natural language texts where stopword distribution can help define semantic or syntactic boundaries.
|
|
165
|
+
|
|
166
|
+
#### Example
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from rp_segmentation import n_stop_words_segmentation
|
|
170
|
+
|
|
171
|
+
text = "Alpha the beta and gamma is delta of omega."
|
|
172
|
+
|
|
173
|
+
segments = n_stop_words_segmentation(
|
|
174
|
+
text,
|
|
175
|
+
language="english",
|
|
176
|
+
n=2,
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
print(segments)
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Output:
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
[
|
|
186
|
+
"alpha the beta and",
|
|
187
|
+
"gamma is delta of",
|
|
188
|
+
"omega",
|
|
189
|
+
]
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
With stopword removal:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
segments = n_stop_words_segmentation(
|
|
196
|
+
text,
|
|
197
|
+
language="english",
|
|
198
|
+
n=2,
|
|
199
|
+
remove_stopwords=True,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
print(segments)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Output:
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
[
|
|
209
|
+
"alpha beta",
|
|
210
|
+
"gamma delta",
|
|
211
|
+
"omega",
|
|
212
|
+
]
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Use Cases
|
|
216
|
+
|
|
217
|
+
`rp-segmentation` can be used in a wide range of text processing tasks, including:
|
|
218
|
+
|
|
219
|
+
* Natural Language Processing.
|
|
220
|
+
* Text normalization.
|
|
221
|
+
* Document preprocessing.
|
|
222
|
+
* Semantic search.
|
|
223
|
+
* Embedding preparation.
|
|
224
|
+
* Retrieval-Augmented Generation pipelines.
|
|
225
|
+
* Educational and research-oriented NLP projects.
|
|
226
|
+
|
|
227
|
+
## Local Development
|
|
228
|
+
|
|
229
|
+
Clone the repository:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
git clone https://github.com/pablonicolasr/rp_segmentation.git
|
|
233
|
+
cd rp_segmentation
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Create and activate a virtual environment:
|
|
237
|
+
|
|
238
|
+
```powershell
|
|
239
|
+
python -m venv .venv
|
|
240
|
+
.venv\Scripts\Activate.ps1
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
Install the package with development dependencies:
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
pip install -e ".[dev]"
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
Install the required NLTK resources:
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
python -m nltk.downloader punkt_tab
|
|
253
|
+
python -m nltk.downloader stopwords
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
Run code quality checks:
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
ruff check .
|
|
260
|
+
mypy src
|
|
261
|
+
pytest --cov=rp_segmentation --cov-report=term-missing
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
## Project Structure
|
|
265
|
+
|
|
266
|
+
```text
|
|
267
|
+
rp-segmentation/
|
|
268
|
+
├── src/
|
|
269
|
+
│ └── rp_segmentation/
|
|
270
|
+
│ ├── __init__.py
|
|
271
|
+
│ ├── segmenters.py
|
|
272
|
+
│ ├── nltk_resources.py
|
|
273
|
+
│ ├── exceptions.py
|
|
274
|
+
│ └── py.typed
|
|
275
|
+
├── tests/
|
|
276
|
+
│ └── test_segmenters.py
|
|
277
|
+
├── docs/
|
|
278
|
+
├── .github/
|
|
279
|
+
│ └── workflows/
|
|
280
|
+
│ ├── ci.yml
|
|
281
|
+
│ └── publish.yml
|
|
282
|
+
├── README.md
|
|
283
|
+
├── CHANGELOG.md
|
|
284
|
+
├── CONTRIBUTING.md
|
|
285
|
+
├── SECURITY.md
|
|
286
|
+
├── LICENSE
|
|
287
|
+
├── pyproject.toml
|
|
288
|
+
├── requirements-dev.txt
|
|
289
|
+
└── .gitignore
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## Authors
|
|
293
|
+
|
|
294
|
+
* Pablo Nicolás Ramos
|
|
295
|
+
* Ricardo Daniel Perez
|
|
296
|
+
|
|
297
|
+
## License
|
|
298
|
+
|
|
299
|
+
This project is licensed under the MIT License.
|