boundary-smart-splitter 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- boundary_smart_splitter-1.0.0/LICENSE +21 -0
- boundary_smart_splitter-1.0.0/PKG-INFO +324 -0
- boundary_smart_splitter-1.0.0/README.md +291 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter/__init__.py +15 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter/base.py +50 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter/langchain/__init__.py +15 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter/langchain/compat.py +205 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter/v1_word.py +102 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter/v2_sentence.py +209 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter/v3_paragraph.py +220 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter/v4_structure.py +235 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/PKG-INFO +324 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/SOURCES.txt +20 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/dependency_links.txt +1 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/requires.txt +10 -0
- boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/top_level.txt +1 -0
- boundary_smart_splitter-1.0.0/pyproject.toml +62 -0
- boundary_smart_splitter-1.0.0/setup.cfg +4 -0
- boundary_smart_splitter-1.0.0/tests/test_v1.py +82 -0
- boundary_smart_splitter-1.0.0/tests/test_v2.py +105 -0
- boundary_smart_splitter-1.0.0/tests/test_v3.py +139 -0
- boundary_smart_splitter-1.0.0/tests/test_v4.py +464 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 boundary-smart-splitter contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: boundary-smart-splitter
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A context-aware, boundary-respecting text splitter for Python.
|
|
5
|
+
Author-email: Jeet Mondal <JeetMondal777@users.noreply.github.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/JeetMondal777/boundary-smart-splitter
|
|
8
|
+
Project-URL: Repository, https://github.com/JeetMondal777/boundary-smart-splitter
|
|
9
|
+
Project-URL: Issues, https://github.com/JeetMondal777/boundary-smart-splitter/issues
|
|
10
|
+
Keywords: text,splitter,nlp,chunking,langchain
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
21
|
+
Requires-Python: >=3.8
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Provides-Extra: langchain
|
|
25
|
+
Requires-Dist: langchain-text-splitters>=0.0.1; extra == "langchain"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
29
|
+
Requires-Dist: black; extra == "dev"
|
|
30
|
+
Requires-Dist: ruff; extra == "dev"
|
|
31
|
+
Requires-Dist: mypy; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# boundary-smart-splitter
|
|
35
|
+
|
|
36
|
+
> A **boundary-first, size-second** text splitter for Python.
|
|
37
|
+
> LangChain-compatible. Framework-agnostic core. No ML dependencies.
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
Boundary-smart-splitter respects your text structure.
|
|
41
|
+
Word → Sentence → Paragraph → Section/Topic
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Most text splitters hit a character count and slice — regardless of whether they cut through a sentence, a paragraph, or a heading. This library flips that: **chunk_size is measured in semantic units** (words, sentences, paragraphs, sections), and `max_chars` acts only as a hard safety ceiling to protect your embedding model's context window.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install boundary-smart-splitter
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
For LangChain support, install the optional dependency:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install boundary-smart-splitter[langchain]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**No ML dependencies required.** Every splitter below is deterministic, regex-based, and works 100% offline.
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Design Philosophy
|
|
65
|
+
|
|
66
|
+
| Principle | How it works |
|
|
67
|
+
|---|---|
|
|
68
|
+
| **Boundary-first, size-second** | Clean boundaries are more important than exact character counts |
|
|
69
|
+
| **Semantic units** | `chunk_size` means words for V1, sentences for V2, paragraphs for V3, sections for V4 |
|
|
70
|
+
| **max_chars as safety net** | Never exceeds your embedding model's context window, but only as a last resort |
|
|
71
|
+
| **Graceful fallback chain** | V4 → V3 → V2 → V1 → hard char-cut |
|
|
72
|
+
| **LangChain-compatible, not dependent** | The core never imports LangChain; wrappers are optional |
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Quick Start
|
|
77
|
+
|
|
78
|
+
### V1 — Word-Boundary Splitter
|
|
79
|
+
|
|
80
|
+
Never cuts a word in half. `chunk_size` = number of words.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from boundary_smart_splitter import WordSplitter
|
|
84
|
+
|
|
85
|
+
text = "The quick brown fox jumps over the lazy dog. " * 100
|
|
86
|
+
splitter = WordSplitter(chunk_size=60, max_chars=500, tolerance=10)
|
|
87
|
+
chunks = splitter.split(text)
|
|
88
|
+
|
|
89
|
+
for chunk in chunks:
|
|
90
|
+
print(chunk)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
### V2 — Sentence-Boundary Splitter
|
|
96
|
+
|
|
97
|
+
Never cuts a sentence in half. Finds the nearest `.`, `?`, or `!` within tolerance.
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from boundary_smart_splitter import SentenceSplitter
|
|
101
|
+
|
|
102
|
+
text = "Hello world. How are you? I am fine! " * 100
|
|
103
|
+
splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2)
|
|
104
|
+
chunks = splitter.split(text)
|
|
105
|
+
|
|
106
|
+
for chunk in chunks:
|
|
107
|
+
print(chunk)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
#### Custom Abbreviations
|
|
111
|
+
|
|
112
|
+
Prevent mid-abbreviation splits (e.g. `Dr.`, `U.S.A.`, `e.g.`):
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
splitter = SentenceSplitter(
|
|
116
|
+
chunk_size=5,
|
|
117
|
+
max_chars=500,
|
|
118
|
+
tolerance=2,
|
|
119
|
+
abbreviations={"Dr.", "U.S.A.", "e.g.", "i.e."},
|
|
120
|
+
)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
#### Boundary Preference
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
# Forward-first (default): scan forward for sentence end, then backward
|
|
127
|
+
splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2, boundary_preference="forward")
|
|
128
|
+
|
|
129
|
+
# Backward-first: scan backward first, then forward
|
|
130
|
+
splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2, boundary_preference="backward")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
### V3 — Paragraph-Boundary Splitter
|
|
136
|
+
|
|
137
|
+
Never breaks a paragraph (`\n\n`). `chunk_size` = number of paragraphs.
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from boundary_smart_splitter import ParagraphSplitter
|
|
141
|
+
|
|
142
|
+
text = "Para one.\n\nPara two.\n\nPara three."
|
|
143
|
+
splitter = ParagraphSplitter(chunk_size=2, max_chars=500, tolerance=1)
|
|
144
|
+
chunks = splitter.split(text)
|
|
145
|
+
|
|
146
|
+
for chunk in chunks:
|
|
147
|
+
print(chunk)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
#### Markdown Mode
|
|
151
|
+
|
|
152
|
+
Respects horizontal rules and headings as extra paragraph boundaries:
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
splitter = ParagraphSplitter(
|
|
156
|
+
chunk_size=2,
|
|
157
|
+
max_chars=500,
|
|
158
|
+
tolerance=1,
|
|
159
|
+
use_markdown_mode=True,
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
### V4 — Structure & Topic-Aware Splitter
|
|
166
|
+
|
|
167
|
+
Respects headings, numbered sections, and transition phrases. `chunk_size` = number of sections.
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
from boundary_smart_splitter import StructureSplitter
|
|
171
|
+
|
|
172
|
+
text = """
|
|
173
|
+
# Intro
|
|
174
|
+
Welcome to our proposal.
|
|
175
|
+
|
|
176
|
+
1. Home Page
|
|
177
|
+
The homepage will act as a strong first impression...
|
|
178
|
+
|
|
179
|
+
2. About Us Page
|
|
180
|
+
To build credibility and trust.
|
|
181
|
+
|
|
182
|
+
However, we must also consider costs.
|
|
183
|
+
|
|
184
|
+
3. Services Page
|
|
185
|
+
A detailed breakdown of all services.
|
|
186
|
+
"""
|
|
187
|
+
|
|
188
|
+
splitter = StructureSplitter(
|
|
189
|
+
chunk_size=1,
|
|
190
|
+
max_chars=1500,
|
|
191
|
+
respect_headings=True,
|
|
192
|
+
respect_numbered_sections=True,
|
|
193
|
+
split_on_transitions=True,
|
|
194
|
+
)
|
|
195
|
+
chunks = splitter.split(text)
|
|
196
|
+
|
|
197
|
+
for chunk in chunks:
|
|
198
|
+
print("---")
|
|
199
|
+
print(chunk)
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
#### V4 Configuration
|
|
203
|
+
|
|
204
|
+
| Parameter | Default | Description |
|
|
205
|
+
|---|---|---|
|
|
206
|
+
| `respect_headings` | `True` | Treat `# Heading` as a hard boundary |
|
|
207
|
+
| `respect_numbered_sections` | `True` | Treat `1. Title` as a boundary |
|
|
208
|
+
| `split_on_transitions` | `True` | Treat "However," / "In summary," etc. as boundaries |
|
|
209
|
+
| `transition_phrases` | built-in list | Custom list of transition phrases |
|
|
210
|
+
| `double_newline_as_boundary` | `True` | Treat `\n\n\n` as a boundary |
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## Fallback Chain
|
|
215
|
+
|
|
216
|
+
When a single unit exceeds `max_chars`, the library gracefully falls back to the next-smaller boundary:
|
|
217
|
+
|
|
218
|
+
```
|
|
219
|
+
StructureSplitter (V4)
|
|
220
|
+
→ section too big? → ParagraphSplitter (V3)
|
|
221
|
+
→ paragraph too big? → SentenceSplitter (V2)
|
|
222
|
+
→ sentence too big? → WordSplitter (V1)
|
|
223
|
+
→ word too big? → hard char-cut
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
This ensures you **never** get a chunk larger than `max_chars`, but you **always** get the cleanest possible boundary.
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## LangChain Integration
|
|
231
|
+
|
|
232
|
+
All splitters have LangChain-compatible wrappers. The core never imports LangChain — wrappers are optional.
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
from boundary_smart_splitter.langchain import (
|
|
236
|
+
LangChainWordSplitter,
|
|
237
|
+
LangChainSentenceSplitter,
|
|
238
|
+
LangChainParagraphSplitter,
|
|
239
|
+
LangChainStructureSplitter,
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
splitter = LangChainStructureSplitter(chunk_size=1, max_chars=1500)
|
|
243
|
+
chunks = splitter.split_text("Your long document here...")
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## API Comparison
|
|
249
|
+
|
|
250
|
+
| Splitter | `chunk_size` unit | `max_chars` role | Fallback | Key Params |
|
|
251
|
+
|---|---|---|---|---|
|
|
252
|
+
| `WordSplitter` | words | hard ceiling, never exceeded | hard char-cut | `tolerance` |
|
|
253
|
+
| `SentenceSplitter` | sentences | if sentence exceeds it, fall back to V1 | `WordSplitter` | `tolerance`, `boundary_preference`, `abbreviations` |
|
|
254
|
+
| `ParagraphSplitter` | paragraphs | if paragraph exceeds it, fall back to V2 | `SentenceSplitter` | `tolerance`, `overlap`, `paragraph_separator`, `use_markdown_mode` |
|
|
255
|
+
| `StructureSplitter` | sections / topics | if section exceeds it, fall back to V3 | `ParagraphSplitter` | `respect_headings`, `respect_numbered_sections`, `split_on_transitions` |
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Why boundary-first matters
|
|
260
|
+
|
|
261
|
+
**Size-first** splitting (the common approach):
|
|
262
|
+
```
|
|
263
|
+
"...the quick brown fox jum" | "ps over the lazy dog..."
|
|
264
|
+
# Bad: cuts through a word
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
**Boundary-first** splitting (this library):
|
|
268
|
+
```
|
|
269
|
+
"...the quick brown fox" | "jumps over the lazy dog..."
|
|
270
|
+
# Good: clean word boundary
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
This becomes critical at sentence level:
|
|
274
|
+
```
|
|
275
|
+
"Please visit the U.S.A. for travel. Yes!"
|
|
276
|
+
# sentence-aware: kept together
|
|
277
|
+
# naive char-cut: "U.S.A." might get split
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
And at paragraph/section level for RAG:
|
|
281
|
+
```
|
|
282
|
+
"1. Home Page\n\nThe homepage..." | "2. About Us\n\nTo build..."
|
|
283
|
+
# section-aware: one chunk per page/section
|
|
284
|
+
# naive: cuts through the middle of a page description
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
---
|
|
288
|
+
|
|
289
|
+
## Features
|
|
290
|
+
|
|
291
|
+
- **Boundary-first, size-second**: clean boundaries always take priority over exact counts
|
|
292
|
+
- **Semantic units for `chunk_size`**: words count words, sentences count sentences, paragraphs count paragraphs, sections count sections
|
|
293
|
+
- **`max_chars` as universal safety net**: protects embedding model limits without breaking the semantic contract
|
|
294
|
+
- **Graceful fallback chain**: V4 → V3 → V2 → V1, each level catches what the level above can't handle
|
|
295
|
+
- **LangChain-compatible, not dependent**: the core never imports LangChain
|
|
296
|
+
- **Each version is a superset**: V2 includes V1, V3 includes V2, V4 includes V3
|
|
297
|
+
- **No ML dependencies**: fast, deterministic, offline-capable
|
|
298
|
+
- **Abbreviation-aware**: configurable abbreviation list prevents false sentence breaks
|
|
299
|
+
- **Markdown-aware**: optional Markdown paragraph/heading detection
|
|
300
|
+
- **PDF-friendly**: handles leading whitespace from PDF text extraction
|
|
301
|
+
|
|
302
|
+
---
|
|
303
|
+
|
|
304
|
+
## Requirements
|
|
305
|
+
|
|
306
|
+
- Python >= 3.8
|
|
307
|
+
|
|
308
|
+
---
|
|
309
|
+
|
|
310
|
+
## Contributing
|
|
311
|
+
|
|
312
|
+
Pull requests are welcome! For major changes, please open an issue first to discuss what you would like to change.
|
|
313
|
+
|
|
314
|
+
1. Fork the repository
|
|
315
|
+
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
|
316
|
+
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
|
317
|
+
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
|
318
|
+
5. Open a Pull Request
|
|
319
|
+
|
|
320
|
+
---
|
|
321
|
+
|
|
322
|
+
## License
|
|
323
|
+
|
|
324
|
+
MIT © 2026
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
# boundary-smart-splitter
|
|
2
|
+
|
|
3
|
+
> A **boundary-first, size-second** text splitter for Python.
|
|
4
|
+
> LangChain-compatible. Framework-agnostic core. No ML dependencies.
|
|
5
|
+
|
|
6
|
+
```
|
|
7
|
+
Boundary-smart-splitter respects your text structure.
|
|
8
|
+
Word → Sentence → Paragraph → Section/Topic
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Most text splitters hit a character count and slice — regardless of whether they cut through a sentence, a paragraph, or a heading. This library flips that: **chunk_size is measured in semantic units** (words, sentences, paragraphs, sections), and `max_chars` acts only as a hard safety ceiling to protect your embedding model's context window.
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install boundary-smart-splitter
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
For LangChain support, install the optional dependency:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install boundary-smart-splitter[langchain]
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**No ML dependencies required.** Every splitter below is deterministic, regex-based, and works 100% offline.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Design Philosophy
|
|
32
|
+
|
|
33
|
+
| Principle | How it works |
|
|
34
|
+
|---|---|
|
|
35
|
+
| **Boundary-first, size-second** | Clean boundaries are more important than exact character counts |
|
|
36
|
+
| **Semantic units** | `chunk_size` means words for V1, sentences for V2, paragraphs for V3, sections for V4 |
|
|
37
|
+
| **max_chars as safety net** | Never exceeds your embedding model's context window, but only as a last resort |
|
|
38
|
+
| **Graceful fallback chain** | V4 → V3 → V2 → V1 → hard char-cut |
|
|
39
|
+
| **LangChain-compatible, not dependent** | The core never imports LangChain; wrappers are optional |
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Quick Start
|
|
44
|
+
|
|
45
|
+
### V1 — Word-Boundary Splitter
|
|
46
|
+
|
|
47
|
+
Never cuts a word in half. `chunk_size` = number of words.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from boundary_smart_splitter import WordSplitter
|
|
51
|
+
|
|
52
|
+
text = "The quick brown fox jumps over the lazy dog. " * 100
|
|
53
|
+
splitter = WordSplitter(chunk_size=60, max_chars=500, tolerance=10)
|
|
54
|
+
chunks = splitter.split(text)
|
|
55
|
+
|
|
56
|
+
for chunk in chunks:
|
|
57
|
+
print(chunk)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
### V2 — Sentence-Boundary Splitter
|
|
63
|
+
|
|
64
|
+
Never cuts a sentence in half. Finds the nearest `.`, `?`, or `!` within tolerance.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from boundary_smart_splitter import SentenceSplitter
|
|
68
|
+
|
|
69
|
+
text = "Hello world. How are you? I am fine! " * 100
|
|
70
|
+
splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2)
|
|
71
|
+
chunks = splitter.split(text)
|
|
72
|
+
|
|
73
|
+
for chunk in chunks:
|
|
74
|
+
print(chunk)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
#### Custom Abbreviations
|
|
78
|
+
|
|
79
|
+
Prevent mid-abbreviation splits (e.g. `Dr.`, `U.S.A.`, `e.g.`):
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
splitter = SentenceSplitter(
|
|
83
|
+
chunk_size=5,
|
|
84
|
+
max_chars=500,
|
|
85
|
+
tolerance=2,
|
|
86
|
+
abbreviations={"Dr.", "U.S.A.", "e.g.", "i.e."},
|
|
87
|
+
)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
#### Boundary Preference
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
# Forward-first (default): scan forward for sentence end, then backward
|
|
94
|
+
splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2, boundary_preference="forward")
|
|
95
|
+
|
|
96
|
+
# Backward-first: scan backward first, then forward
|
|
97
|
+
splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2, boundary_preference="backward")
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
### V3 — Paragraph-Boundary Splitter
|
|
103
|
+
|
|
104
|
+
Never breaks a paragraph (`\n\n`). `chunk_size` = number of paragraphs.
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from boundary_smart_splitter import ParagraphSplitter
|
|
108
|
+
|
|
109
|
+
text = "Para one.\n\nPara two.\n\nPara three."
|
|
110
|
+
splitter = ParagraphSplitter(chunk_size=2, max_chars=500, tolerance=1)
|
|
111
|
+
chunks = splitter.split(text)
|
|
112
|
+
|
|
113
|
+
for chunk in chunks:
|
|
114
|
+
print(chunk)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
#### Markdown Mode
|
|
118
|
+
|
|
119
|
+
Respects horizontal rules and headings as extra paragraph boundaries:
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
splitter = ParagraphSplitter(
|
|
123
|
+
chunk_size=2,
|
|
124
|
+
max_chars=500,
|
|
125
|
+
tolerance=1,
|
|
126
|
+
use_markdown_mode=True,
|
|
127
|
+
)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
### V4 — Structure & Topic-Aware Splitter
|
|
133
|
+
|
|
134
|
+
Respects headings, numbered sections, and transition phrases. `chunk_size` = number of sections.
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from boundary_smart_splitter import StructureSplitter
|
|
138
|
+
|
|
139
|
+
text = """
|
|
140
|
+
# Intro
|
|
141
|
+
Welcome to our proposal.
|
|
142
|
+
|
|
143
|
+
1. Home Page
|
|
144
|
+
The homepage will act as a strong first impression...
|
|
145
|
+
|
|
146
|
+
2. About Us Page
|
|
147
|
+
To build credibility and trust.
|
|
148
|
+
|
|
149
|
+
However, we must also consider costs.
|
|
150
|
+
|
|
151
|
+
3. Services Page
|
|
152
|
+
A detailed breakdown of all services.
|
|
153
|
+
"""
|
|
154
|
+
|
|
155
|
+
splitter = StructureSplitter(
|
|
156
|
+
chunk_size=1,
|
|
157
|
+
max_chars=1500,
|
|
158
|
+
respect_headings=True,
|
|
159
|
+
respect_numbered_sections=True,
|
|
160
|
+
split_on_transitions=True,
|
|
161
|
+
)
|
|
162
|
+
chunks = splitter.split(text)
|
|
163
|
+
|
|
164
|
+
for chunk in chunks:
|
|
165
|
+
print("---")
|
|
166
|
+
print(chunk)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
#### V4 Configuration
|
|
170
|
+
|
|
171
|
+
| Parameter | Default | Description |
|
|
172
|
+
|---|---|---|
|
|
173
|
+
| `respect_headings` | `True` | Treat `# Heading` as a hard boundary |
|
|
174
|
+
| `respect_numbered_sections` | `True` | Treat `1. Title` as a boundary |
|
|
175
|
+
| `split_on_transitions` | `True` | Treat "However," / "In summary," etc. as boundaries |
|
|
176
|
+
| `transition_phrases` | built-in list | Custom list of transition phrases |
|
|
177
|
+
| `double_newline_as_boundary` | `True` | Treat `\n\n\n` as a boundary |
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## Fallback Chain
|
|
182
|
+
|
|
183
|
+
When a single unit exceeds `max_chars`, the library gracefully falls back to the next-smaller boundary:
|
|
184
|
+
|
|
185
|
+
```
|
|
186
|
+
StructureSplitter (V4)
|
|
187
|
+
→ section too big? → ParagraphSplitter (V3)
|
|
188
|
+
→ paragraph too big? → SentenceSplitter (V2)
|
|
189
|
+
→ sentence too big? → WordSplitter (V1)
|
|
190
|
+
→ word too big? → hard char-cut
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
This ensures you **never** get a chunk larger than `max_chars`, but you **always** get the cleanest possible boundary.
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## LangChain Integration
|
|
198
|
+
|
|
199
|
+
All splitters have LangChain-compatible wrappers. The core never imports LangChain — wrappers are optional.
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
from boundary_smart_splitter.langchain import (
|
|
203
|
+
LangChainWordSplitter,
|
|
204
|
+
LangChainSentenceSplitter,
|
|
205
|
+
LangChainParagraphSplitter,
|
|
206
|
+
LangChainStructureSplitter,
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
splitter = LangChainStructureSplitter(chunk_size=1, max_chars=1500)
|
|
210
|
+
chunks = splitter.split_text("Your long document here...")
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## API Comparison
|
|
216
|
+
|
|
217
|
+
| Splitter | `chunk_size` unit | `max_chars` role | Fallback | Key Params |
|
|
218
|
+
|---|---|---|---|---|
|
|
219
|
+
| `WordSplitter` | words | hard ceiling, never exceeded | hard char-cut | `tolerance` |
|
|
220
|
+
| `SentenceSplitter` | sentences | if sentence exceeds it, fall back to V1 | `WordSplitter` | `tolerance`, `boundary_preference`, `abbreviations` |
|
|
221
|
+
| `ParagraphSplitter` | paragraphs | if paragraph exceeds it, fall back to V2 | `SentenceSplitter` | `tolerance`, `overlap`, `paragraph_separator`, `use_markdown_mode` |
|
|
222
|
+
| `StructureSplitter` | sections / topics | if section exceeds it, fall back to V3 | `ParagraphSplitter` | `respect_headings`, `respect_numbered_sections`, `split_on_transitions` |
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Why boundary-first matters
|
|
227
|
+
|
|
228
|
+
**Size-first** splitting (the common approach):
|
|
229
|
+
```
|
|
230
|
+
"...the quick brown fox jum" | "ps over the lazy dog..."
|
|
231
|
+
# Bad: cuts through a word
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
**Boundary-first** splitting (this library):
|
|
235
|
+
```
|
|
236
|
+
"...the quick brown fox" | "jumps over the lazy dog..."
|
|
237
|
+
# Good: clean word boundary
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
This becomes critical at sentence level:
|
|
241
|
+
```
|
|
242
|
+
"Please visit the U.S.A. for travel. Yes!"
|
|
243
|
+
# sentence-aware: kept together
|
|
244
|
+
# naive char-cut: "U.S.A." might get split
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
And at paragraph/section level for RAG:
|
|
248
|
+
```
|
|
249
|
+
"1. Home Page\n\nThe homepage..." | "2. About Us\n\nTo build..."
|
|
250
|
+
# section-aware: one chunk per page/section
|
|
251
|
+
# naive: cuts through the middle of a page description
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Features
|
|
257
|
+
|
|
258
|
+
- **Boundary-first, size-second**: clean boundaries always take priority over exact counts
|
|
259
|
+
- **Semantic units for `chunk_size`**: words count words, sentences count sentences, paragraphs count paragraphs, sections count sections
|
|
260
|
+
- **`max_chars` as universal safety net**: protects embedding model limits without breaking the semantic contract
|
|
261
|
+
- **Graceful fallback chain**: V4 → V3 → V2 → V1, each level catches what the level above can't handle
|
|
262
|
+
- **LangChain-compatible, not dependent**: the core never imports LangChain
|
|
263
|
+
- **Each version is a superset**: V2 includes V1, V3 includes V2, V4 includes V3
|
|
264
|
+
- **No ML dependencies**: fast, deterministic, offline-capable
|
|
265
|
+
- **Abbreviation-aware**: configurable abbreviation list prevents false sentence breaks
|
|
266
|
+
- **Markdown-aware**: optional Markdown paragraph/heading detection
|
|
267
|
+
- **PDF-friendly**: handles leading whitespace from PDF text extraction
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## Requirements
|
|
272
|
+
|
|
273
|
+
- Python >= 3.8
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## Contributing
|
|
278
|
+
|
|
279
|
+
Pull requests are welcome! For major changes, please open an issue first to discuss what you would like to change.
|
|
280
|
+
|
|
281
|
+
1. Fork the repository
|
|
282
|
+
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
|
|
283
|
+
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
|
|
284
|
+
4. Push to the branch (`git push origin feature/AmazingFeature`)
|
|
285
|
+
5. Open a Pull Request
|
|
286
|
+
|
|
287
|
+
---
|
|
288
|
+
|
|
289
|
+
## License
|
|
290
|
+
|
|
291
|
+
MIT © 2026
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""boundary-smart-splitter: A context-aware, boundary-respecting text splitter."""
|
|
2
|
+
|
|
3
|
+
from .v1_word import WordSplitter
|
|
4
|
+
from .v2_sentence import SentenceSplitter
|
|
5
|
+
from .v3_paragraph import ParagraphSplitter
|
|
6
|
+
from .v4_structure import StructureSplitter
|
|
7
|
+
|
|
8
|
+
__version__ = "1.0.0"
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"WordSplitter",
|
|
12
|
+
"SentenceSplitter",
|
|
13
|
+
"ParagraphSplitter",
|
|
14
|
+
"StructureSplitter",
|
|
15
|
+
]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class BaseSplitter(ABC):
    """Abstract base class for all splitters.

    Each concrete splitter measures ``chunk_size`` in its **own semantic
    unit** — words, sentences, paragraphs, or sections. ``max_chars`` is the
    character ceiling meant to protect embedding-model context limits; this
    base class only validates and stores it, so honouring the ceiling is the
    responsibility of each subclass's :meth:`split`.

    Parameters
    ----------
    chunk_size : int
        Target count of semantic units (words / sentences / paragraphs /
        sections) per chunk. Must be greater than zero.
    max_chars : int
        Hard character ceiling for a single chunk. Must be greater than zero.
    tolerance : int
        Number of semantic units to scan forward or backward when
        adjusting a boundary so it lands on a clean break. Must be zero or
        greater.

    Raises
    ------
    ValueError
        If ``chunk_size`` or ``max_chars`` is not positive, or if
        ``tolerance`` is negative.
    """

    def __init__(
        self,
        *,
        chunk_size: int,
        max_chars: int,
        tolerance: int,
    ) -> None:
        # Validate eagerly so a bad configuration fails at construction time
        # rather than somewhere inside a later split() call.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if max_chars <= 0:
            raise ValueError("max_chars must be a positive integer")
        if tolerance < 0:
            raise ValueError("tolerance must be a non-negative integer")
        self.chunk_size = chunk_size
        self.max_chars = max_chars
        self.tolerance = tolerance

    @abstractmethod
    def split(self, text: str) -> List[str]:
        """Split *text* into chunks and return them."""
        ...

    def _strip_and_filter(self, chunks: List[str]) -> List[str]:
        """Return *chunks* with surrounding whitespace removed.

        Chunks that are empty or whitespace-only are dropped; the relative
        order of the remaining chunks is preserved.
        """
        # Strip each chunk exactly once, then discard the ones left empty.
        stripped = (chunk.strip() for chunk in chunks)
        return [chunk for chunk in stripped if chunk]
|