boundary-smart-splitter 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. boundary_smart_splitter-1.0.0/LICENSE +21 -0
  2. boundary_smart_splitter-1.0.0/PKG-INFO +324 -0
  3. boundary_smart_splitter-1.0.0/README.md +291 -0
  4. boundary_smart_splitter-1.0.0/boundary_smart_splitter/__init__.py +15 -0
  5. boundary_smart_splitter-1.0.0/boundary_smart_splitter/base.py +50 -0
  6. boundary_smart_splitter-1.0.0/boundary_smart_splitter/langchain/__init__.py +15 -0
  7. boundary_smart_splitter-1.0.0/boundary_smart_splitter/langchain/compat.py +205 -0
  8. boundary_smart_splitter-1.0.0/boundary_smart_splitter/v1_word.py +102 -0
  9. boundary_smart_splitter-1.0.0/boundary_smart_splitter/v2_sentence.py +209 -0
  10. boundary_smart_splitter-1.0.0/boundary_smart_splitter/v3_paragraph.py +220 -0
  11. boundary_smart_splitter-1.0.0/boundary_smart_splitter/v4_structure.py +235 -0
  12. boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/PKG-INFO +324 -0
  13. boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/SOURCES.txt +20 -0
  14. boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/dependency_links.txt +1 -0
  15. boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/requires.txt +10 -0
  16. boundary_smart_splitter-1.0.0/boundary_smart_splitter.egg-info/top_level.txt +1 -0
  17. boundary_smart_splitter-1.0.0/pyproject.toml +62 -0
  18. boundary_smart_splitter-1.0.0/setup.cfg +4 -0
  19. boundary_smart_splitter-1.0.0/tests/test_v1.py +82 -0
  20. boundary_smart_splitter-1.0.0/tests/test_v2.py +105 -0
  21. boundary_smart_splitter-1.0.0/tests/test_v3.py +139 -0
  22. boundary_smart_splitter-1.0.0/tests/test_v4.py +464 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 boundary-smart-splitter contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,324 @@
1
+ Metadata-Version: 2.4
2
+ Name: boundary-smart-splitter
3
+ Version: 1.0.0
4
+ Summary: A context-aware, boundary-respecting text splitter for Python.
5
+ Author-email: Jeet Mondal <JeetMondal777@users.noreply.github.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/JeetMondal777/boundary-smart-splitter
8
+ Project-URL: Repository, https://github.com/JeetMondal777/boundary-smart-splitter
9
+ Project-URL: Issues, https://github.com/JeetMondal777/boundary-smart-splitter/issues
10
+ Keywords: text,splitter,nlp,chunking,langchain
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Text Processing :: Linguistic
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Provides-Extra: langchain
25
+ Requires-Dist: langchain-text-splitters>=0.0.1; extra == "langchain"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.0; extra == "dev"
28
+ Requires-Dist: pytest-cov; extra == "dev"
29
+ Requires-Dist: black; extra == "dev"
30
+ Requires-Dist: ruff; extra == "dev"
31
+ Requires-Dist: mypy; extra == "dev"
32
+ Dynamic: license-file
33
+
34
+ # boundary-smart-splitter
35
+
36
+ > A **boundary-first, size-second** text splitter for Python.
37
+ > LangChain-compatible. Framework-agnostic core. No ML dependencies.
38
+
39
+ ```
40
+ Boundary-smart-splitter respects your text structure.
41
+ Word → Sentence → Paragraph → Section/Topic
42
+ ```
43
+
44
+ Most text splitters hit a character count and slice — regardless of whether they cut through a sentence, a paragraph, or a heading. This library flips that: **chunk_size is measured in semantic units** (words, sentences, paragraphs, sections), and `max_chars` acts only as a hard safety ceiling to protect your embedding model's context window.
45
+
46
+ ---
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ pip install boundary-smart-splitter
52
+ ```
53
+
54
+ For LangChain support, install the optional dependency:
55
+
56
+ ```bash
57
+ pip install boundary-smart-splitter[langchain]
58
+ ```
59
+
60
+ **No ML dependencies required.** Every splitter below is deterministic, regex-based, and works 100% offline.
61
+
62
+ ---
63
+
64
+ ## Design Philosophy
65
+
66
+ | Principle | How it works |
67
+ |---|---|
68
+ | **Boundary-first, size-second** | Clean boundaries are more important than exact character counts |
69
+ | **Semantic units** | `chunk_size` means words for V1, sentences for V2, paragraphs for V3, sections for V4 |
70
+ | **max_chars as safety net** | Never exceeds your embedding model's context window, but only as a last resort |
71
+ | **Graceful fallback chain** | V4 → V3 → V2 → V1 → hard char-cut |
72
+ | **LangChain-compatible, not dependent** | The core never imports LangChain; wrappers are optional |
73
+
74
+ ---
75
+
76
+ ## Quick Start
77
+
78
+ ### V1 — Word-Boundary Splitter
79
+
80
+ Never cuts a word in half. `chunk_size` = number of words.
81
+
82
+ ```python
83
+ from boundary_smart_splitter import WordSplitter
84
+
85
+ text = "The quick brown fox jumps over the lazy dog. " * 100
86
+ splitter = WordSplitter(chunk_size=60, max_chars=500, tolerance=10)
87
+ chunks = splitter.split(text)
88
+
89
+ for chunk in chunks:
90
+ print(chunk)
91
+ ```
92
+
93
+ ---
94
+
95
+ ### V2 — Sentence-Boundary Splitter
96
+
97
+ Never cuts a sentence in half. Finds the nearest `.`, `?`, or `!` within tolerance.
98
+
99
+ ```python
100
+ from boundary_smart_splitter import SentenceSplitter
101
+
102
+ text = "Hello world. How are you? I am fine! " * 100
103
+ splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2)
104
+ chunks = splitter.split(text)
105
+
106
+ for chunk in chunks:
107
+ print(chunk)
108
+ ```
109
+
110
+ #### Custom Abbreviations
111
+
112
+ Prevent mid-abbreviation splits (e.g. `Dr.`, `U.S.A.`, `e.g.`):
113
+
114
+ ```python
115
+ splitter = SentenceSplitter(
116
+ chunk_size=5,
117
+ max_chars=500,
118
+ tolerance=2,
119
+ abbreviations={"Dr.", "U.S.A.", "e.g.", "i.e."},
120
+ )
121
+ ```
122
+
123
+ #### Boundary Preference
124
+
125
+ ```python
126
+ # Forward-first (default): scan forward for sentence end, then backward
127
+ splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2, boundary_preference="forward")
128
+
129
+ # Backward-first: scan backward first, then forward
130
+ splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2, boundary_preference="backward")
131
+ ```
132
+
133
+ ---
134
+
135
+ ### V3 — Paragraph-Boundary Splitter
136
+
137
+ Never breaks a paragraph (`\n\n`). `chunk_size` = number of paragraphs.
138
+
139
+ ```python
140
+ from boundary_smart_splitter import ParagraphSplitter
141
+
142
+ text = "Para one.\n\nPara two.\n\nPara three."
143
+ splitter = ParagraphSplitter(chunk_size=2, max_chars=500, tolerance=1)
144
+ chunks = splitter.split(text)
145
+
146
+ for chunk in chunks:
147
+ print(chunk)
148
+ ```
149
+
150
+ #### Markdown Mode
151
+
152
+ Respects horizontal rules and headings as extra paragraph boundaries:
153
+
154
+ ```python
155
+ splitter = ParagraphSplitter(
156
+ chunk_size=2,
157
+ max_chars=500,
158
+ tolerance=1,
159
+ use_markdown_mode=True,
160
+ )
161
+ ```
162
+
163
+ ---
164
+
165
+ ### V4 — Structure & Topic-Aware Splitter
166
+
167
+ Respects headings, numbered sections, and transition phrases. `chunk_size` = number of sections.
168
+
169
+ ```python
170
+ from boundary_smart_splitter import StructureSplitter
171
+
172
+ text = """
173
+ # Intro
174
+ Welcome to our proposal.
175
+
176
+ 1. Home Page
177
+ The homepage will act as a strong first impression...
178
+
179
+ 2. About Us Page
180
+ To build credibility and trust.
181
+
182
+ However, we must also consider costs.
183
+
184
+ 3. Services Page
185
+ A detailed breakdown of all services.
186
+ """
187
+
188
+ splitter = StructureSplitter(
189
+ chunk_size=1,
190
+ max_chars=1500,
191
+ respect_headings=True,
192
+ respect_numbered_sections=True,
193
+ split_on_transitions=True,
194
+ )
195
+ chunks = splitter.split(text)
196
+
197
+ for chunk in chunks:
198
+ print("---")
199
+ print(chunk)
200
+ ```
201
+
202
+ #### V4 Configuration
203
+
204
+ | Parameter | Default | Description |
205
+ |---|---|---|
206
+ | `respect_headings` | `True` | Treat `# Heading` as a hard boundary |
207
+ | `respect_numbered_sections` | `True` | Treat `1. Title` as a boundary |
208
+ | `split_on_transitions` | `True` | Treat "However," / "In summary," etc. as boundaries |
209
+ | `transition_phrases` | built-in list | Custom list of transition phrases |
210
+ | `double_newline_as_boundary` | `True` | Treat `\n\n\n` as a boundary |
211
+
212
+ ---
213
+
214
+ ## Fallback Chain
215
+
216
+ When a single unit exceeds `max_chars`, the library gracefully falls back to the next-smaller boundary:
217
+
218
+ ```
219
+ StructureSplitter (V4)
220
+ → section too big? → ParagraphSplitter (V3)
221
+ → paragraph too big? → SentenceSplitter (V2)
222
+ → sentence too big? → WordSplitter (V1)
223
+ → word too big? → hard char-cut
224
+ ```
225
+
226
+ This ensures you **never** get a chunk larger than `max_chars`, but you **always** get the cleanest possible boundary.
227
+
228
+ ---
229
+
230
+ ## LangChain Integration
231
+
232
+ All splitters have LangChain-compatible wrappers. The core never imports LangChain — wrappers are optional.
233
+
234
+ ```python
235
+ from boundary_smart_splitter.langchain import (
236
+ LangChainWordSplitter,
237
+ LangChainSentenceSplitter,
238
+ LangChainParagraphSplitter,
239
+ LangChainStructureSplitter,
240
+ )
241
+
242
+ splitter = LangChainStructureSplitter(chunk_size=1, max_chars=1500)
243
+ chunks = splitter.split_text("Your long document here...")
244
+ ```
245
+
246
+ ---
247
+
248
+ ## API Comparison
249
+
250
+ | Splitter | `chunk_size` unit | `max_chars` role | Fallback | Key Params |
251
+ |---|---|---|---|---|
252
+ | `WordSplitter` | words | hard ceiling, never exceeded | hard char-cut | `tolerance` |
253
+ | `SentenceSplitter` | sentences | if sentence exceeds it, fall back to V1 | `WordSplitter` | `tolerance`, `boundary_preference`, `abbreviations` |
254
+ | `ParagraphSplitter` | paragraphs | if paragraph exceeds it, fall back to V2 | `SentenceSplitter` | `tolerance`, `overlap`, `paragraph_separator`, `use_markdown_mode` |
255
+ | `StructureSplitter` | sections / topics | if section exceeds it, fall back to V3 | `ParagraphSplitter` | `respect_headings`, `respect_numbered_sections`, `split_on_transitions` |
256
+
257
+ ---
258
+
259
+ ## Why boundary-first matters
260
+
261
+ **Size-first** splitting (the common approach):
262
+ ```
263
+ "...the quick brown fox jum" | "ps over the lazy dog..."
264
+ # Bad: cuts through a word
265
+ ```
266
+
267
+ **Boundary-first** splitting (this library):
268
+ ```
269
+ "...the quick brown fox" | "jumps over the lazy dog..."
270
+ # Good: clean word boundary
271
+ ```
272
+
273
+ This becomes critical at sentence level:
274
+ ```
275
+ "Please visit the U.S.A. for travel. Yes!"
276
+ # sentence-aware: kept together
277
+ # naive char-cut: "U.S.A." might get split
278
+ ```
279
+
280
+ And at paragraph/section level for RAG:
281
+ ```
282
+ "1. Home Page\n\nThe homepage..." | "2. About Us\n\nTo build..."
283
+ # section-aware: one chunk per page/section
284
+ # naive: cuts through the middle of a page description
285
+ ```
286
+
287
+ ---
288
+
289
+ ## Features
290
+
291
+ - **Boundary-first, size-second**: clean boundaries always take priority over exact counts
292
+ - **Semantic units for `chunk_size`**: words count words, sentences count sentences, paragraphs count paragraphs, sections count sections
293
+ - **`max_chars` as universal safety net**: protects embedding model limits without breaking the semantic contract
294
+ - **Graceful fallback chain**: V4 → V3 → V2 → V1, each level catches what the level above can't handle
295
+ - **LangChain-compatible, not dependent**: the core never imports LangChain
296
+ - **Each version is a superset**: V2 includes V1, V3 includes V2, V4 includes V3
297
+ - **No ML dependencies**: fast, deterministic, offline-capable
298
+ - **Abbreviation-aware**: configurable abbreviation list prevents false sentence breaks
299
+ - **Markdown-aware**: optional Markdown paragraph/heading detection
300
+ - **PDF-friendly**: handles leading whitespace from PDF text extraction
301
+
302
+ ---
303
+
304
+ ## Requirements
305
+
306
+ - Python >= 3.8
307
+
308
+ ---
309
+
310
+ ## Contributing
311
+
312
+ Pull requests are welcome! For major changes, please open an issue first to discuss what you would like to change.
313
+
314
+ 1. Fork the repository
315
+ 2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
316
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
317
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
318
+ 5. Open a Pull Request
319
+
320
+ ---
321
+
322
+ ## License
323
+
324
+ MIT © 2026
@@ -0,0 +1,291 @@
1
+ # boundary-smart-splitter
2
+
3
+ > A **boundary-first, size-second** text splitter for Python.
4
+ > LangChain-compatible. Framework-agnostic core. No ML dependencies.
5
+
6
+ ```
7
+ Boundary-smart-splitter respects your text structure.
8
+ Word → Sentence → Paragraph → Section/Topic
9
+ ```
10
+
11
+ Most text splitters hit a character count and slice — regardless of whether they cut through a sentence, a paragraph, or a heading. This library flips that: **chunk_size is measured in semantic units** (words, sentences, paragraphs, sections), and `max_chars` acts only as a hard safety ceiling to protect your embedding model's context window.
12
+
13
+ ---
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ pip install boundary-smart-splitter
19
+ ```
20
+
21
+ For LangChain support, install the optional dependency:
22
+
23
+ ```bash
24
+ pip install boundary-smart-splitter[langchain]
25
+ ```
26
+
27
+ **No ML dependencies required.** Every splitter below is deterministic, regex-based, and works 100% offline.
28
+
29
+ ---
30
+
31
+ ## Design Philosophy
32
+
33
+ | Principle | How it works |
34
+ |---|---|
35
+ | **Boundary-first, size-second** | Clean boundaries are more important than exact character counts |
36
+ | **Semantic units** | `chunk_size` means words for V1, sentences for V2, paragraphs for V3, sections for V4 |
37
+ | **max_chars as safety net** | Never exceeds your embedding model's context window, but only as a last resort |
38
+ | **Graceful fallback chain** | V4 → V3 → V2 → V1 → hard char-cut |
39
+ | **LangChain-compatible, not dependent** | The core never imports LangChain; wrappers are optional |
40
+
41
+ ---
42
+
43
+ ## Quick Start
44
+
45
+ ### V1 — Word-Boundary Splitter
46
+
47
+ Never cuts a word in half. `chunk_size` = number of words.
48
+
49
+ ```python
50
+ from boundary_smart_splitter import WordSplitter
51
+
52
+ text = "The quick brown fox jumps over the lazy dog. " * 100
53
+ splitter = WordSplitter(chunk_size=60, max_chars=500, tolerance=10)
54
+ chunks = splitter.split(text)
55
+
56
+ for chunk in chunks:
57
+ print(chunk)
58
+ ```
59
+
60
+ ---
61
+
62
+ ### V2 — Sentence-Boundary Splitter
63
+
64
+ Never cuts a sentence in half. Finds the nearest `.`, `?`, or `!` within tolerance.
65
+
66
+ ```python
67
+ from boundary_smart_splitter import SentenceSplitter
68
+
69
+ text = "Hello world. How are you? I am fine! " * 100
70
+ splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2)
71
+ chunks = splitter.split(text)
72
+
73
+ for chunk in chunks:
74
+ print(chunk)
75
+ ```
76
+
77
+ #### Custom Abbreviations
78
+
79
+ Prevent mid-abbreviation splits (e.g. `Dr.`, `U.S.A.`, `e.g.`):
80
+
81
+ ```python
82
+ splitter = SentenceSplitter(
83
+ chunk_size=5,
84
+ max_chars=500,
85
+ tolerance=2,
86
+ abbreviations={"Dr.", "U.S.A.", "e.g.", "i.e."},
87
+ )
88
+ ```
89
+
90
+ #### Boundary Preference
91
+
92
+ ```python
93
+ # Forward-first (default): scan forward for sentence end, then backward
94
+ splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2, boundary_preference="forward")
95
+
96
+ # Backward-first: scan backward first, then forward
97
+ splitter = SentenceSplitter(chunk_size=5, max_chars=500, tolerance=2, boundary_preference="backward")
98
+ ```
99
+
100
+ ---
101
+
102
+ ### V3 — Paragraph-Boundary Splitter
103
+
104
+ Never breaks a paragraph (`\n\n`). `chunk_size` = number of paragraphs.
105
+
106
+ ```python
107
+ from boundary_smart_splitter import ParagraphSplitter
108
+
109
+ text = "Para one.\n\nPara two.\n\nPara three."
110
+ splitter = ParagraphSplitter(chunk_size=2, max_chars=500, tolerance=1)
111
+ chunks = splitter.split(text)
112
+
113
+ for chunk in chunks:
114
+ print(chunk)
115
+ ```
116
+
117
+ #### Markdown Mode
118
+
119
+ Respects horizontal rules and headings as extra paragraph boundaries:
120
+
121
+ ```python
122
+ splitter = ParagraphSplitter(
123
+ chunk_size=2,
124
+ max_chars=500,
125
+ tolerance=1,
126
+ use_markdown_mode=True,
127
+ )
128
+ ```
129
+
130
+ ---
131
+
132
+ ### V4 — Structure & Topic-Aware Splitter
133
+
134
+ Respects headings, numbered sections, and transition phrases. `chunk_size` = number of sections.
135
+
136
+ ```python
137
+ from boundary_smart_splitter import StructureSplitter
138
+
139
+ text = """
140
+ # Intro
141
+ Welcome to our proposal.
142
+
143
+ 1. Home Page
144
+ The homepage will act as a strong first impression...
145
+
146
+ 2. About Us Page
147
+ To build credibility and trust.
148
+
149
+ However, we must also consider costs.
150
+
151
+ 3. Services Page
152
+ A detailed breakdown of all services.
153
+ """
154
+
155
+ splitter = StructureSplitter(
156
+ chunk_size=1,
157
+ max_chars=1500,
158
+ respect_headings=True,
159
+ respect_numbered_sections=True,
160
+ split_on_transitions=True,
161
+ )
162
+ chunks = splitter.split(text)
163
+
164
+ for chunk in chunks:
165
+ print("---")
166
+ print(chunk)
167
+ ```
168
+
169
+ #### V4 Configuration
170
+
171
+ | Parameter | Default | Description |
172
+ |---|---|---|
173
+ | `respect_headings` | `True` | Treat `# Heading` as a hard boundary |
174
+ | `respect_numbered_sections` | `True` | Treat `1. Title` as a boundary |
175
+ | `split_on_transitions` | `True` | Treat "However," / "In summary," etc. as boundaries |
176
+ | `transition_phrases` | built-in list | Custom list of transition phrases |
177
+ | `double_newline_as_boundary` | `True` | Treat `\n\n\n` as a boundary |
178
+
179
+ ---
180
+
181
+ ## Fallback Chain
182
+
183
+ When a single unit exceeds `max_chars`, the library gracefully falls back to the next-smaller boundary:
184
+
185
+ ```
186
+ StructureSplitter (V4)
187
+ → section too big? → ParagraphSplitter (V3)
188
+ → paragraph too big? → SentenceSplitter (V2)
189
+ → sentence too big? → WordSplitter (V1)
190
+ → word too big? → hard char-cut
191
+ ```
192
+
193
+ This ensures you **never** get a chunk larger than `max_chars`, but you **always** get the cleanest possible boundary.
194
+
195
+ ---
196
+
197
+ ## LangChain Integration
198
+
199
+ All splitters have LangChain-compatible wrappers. The core never imports LangChain — wrappers are optional.
200
+
201
+ ```python
202
+ from boundary_smart_splitter.langchain import (
203
+ LangChainWordSplitter,
204
+ LangChainSentenceSplitter,
205
+ LangChainParagraphSplitter,
206
+ LangChainStructureSplitter,
207
+ )
208
+
209
+ splitter = LangChainStructureSplitter(chunk_size=1, max_chars=1500)
210
+ chunks = splitter.split_text("Your long document here...")
211
+ ```
212
+
213
+ ---
214
+
215
+ ## API Comparison
216
+
217
+ | Splitter | `chunk_size` unit | `max_chars` role | Fallback | Key Params |
218
+ |---|---|---|---|---|
219
+ | `WordSplitter` | words | hard ceiling, never exceeded | hard char-cut | `tolerance` |
220
+ | `SentenceSplitter` | sentences | if sentence exceeds it, fall back to V1 | `WordSplitter` | `tolerance`, `boundary_preference`, `abbreviations` |
221
+ | `ParagraphSplitter` | paragraphs | if paragraph exceeds it, fall back to V2 | `SentenceSplitter` | `tolerance`, `overlap`, `paragraph_separator`, `use_markdown_mode` |
222
+ | `StructureSplitter` | sections / topics | if section exceeds it, fall back to V3 | `ParagraphSplitter` | `respect_headings`, `respect_numbered_sections`, `split_on_transitions` |
223
+
224
+ ---
225
+
226
+ ## Why boundary-first matters
227
+
228
+ **Size-first** splitting (the common approach):
229
+ ```
230
+ "...the quick brown fox jum" | "ps over the lazy dog..."
231
+ # Bad: cuts through a word
232
+ ```
233
+
234
+ **Boundary-first** splitting (this library):
235
+ ```
236
+ "...the quick brown fox" | "jumps over the lazy dog..."
237
+ # Good: clean word boundary
238
+ ```
239
+
240
+ This becomes critical at sentence level:
241
+ ```
242
+ "Please visit the U.S.A. for travel. Yes!"
243
+ # sentence-aware: kept together
244
+ # naive char-cut: "U.S.A." might get split
245
+ ```
246
+
247
+ And at paragraph/section level for RAG:
248
+ ```
249
+ "1. Home Page\n\nThe homepage..." | "2. About Us\n\nTo build..."
250
+ # section-aware: one chunk per page/section
251
+ # naive: cuts through the middle of a page description
252
+ ```
253
+
254
+ ---
255
+
256
+ ## Features
257
+
258
+ - **Boundary-first, size-second**: clean boundaries always take priority over exact counts
259
+ - **Semantic units for `chunk_size`**: words count words, sentences count sentences, paragraphs count paragraphs, sections count sections
260
+ - **`max_chars` as universal safety net**: protects embedding model limits without breaking the semantic contract
261
+ - **Graceful fallback chain**: V4 → V3 → V2 → V1, each level catches what the level above can't handle
262
+ - **LangChain-compatible, not dependent**: the core never imports LangChain
263
+ - **Each version is a superset**: V2 includes V1, V3 includes V2, V4 includes V3
264
+ - **No ML dependencies**: fast, deterministic, offline-capable
265
+ - **Abbreviation-aware**: configurable abbreviation list prevents false sentence breaks
266
+ - **Markdown-aware**: optional Markdown paragraph/heading detection
267
+ - **PDF-friendly**: handles leading whitespace from PDF text extraction
268
+
269
+ ---
270
+
271
+ ## Requirements
272
+
273
+ - Python >= 3.8
274
+
275
+ ---
276
+
277
+ ## Contributing
278
+
279
+ Pull requests are welcome! For major changes, please open an issue first to discuss what you would like to change.
280
+
281
+ 1. Fork the repository
282
+ 2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
283
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
284
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
285
+ 5. Open a Pull Request
286
+
287
+ ---
288
+
289
+ ## License
290
+
291
+ MIT © 2026
@@ -0,0 +1,15 @@
1
+ """boundary-smart-splitter: A context-aware, boundary-respecting text splitter."""
2
+
3
+ from .v1_word import WordSplitter
4
+ from .v2_sentence import SentenceSplitter
5
+ from .v3_paragraph import ParagraphSplitter
6
+ from .v4_structure import StructureSplitter
7
+
8
+ __version__ = "1.0.0"
9
+
10
+ __all__ = [
11
+ "WordSplitter",
12
+ "SentenceSplitter",
13
+ "ParagraphSplitter",
14
+ "StructureSplitter",
15
+ ]
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import List
5
+
6
+
7
class BaseSplitter(ABC):
    """Abstract base class for all splitters.

    Each concrete splitter measures ``chunk_size`` in its **own semantic
    unit** — words, sentences, paragraphs, or sections. ``max_chars`` is a
    character ceiling intended to protect embedding-model context limits;
    this base class only validates and stores it, enforcement is left to
    the subclass's ``split`` implementation.

    Parameters
    ----------
    chunk_size : int
        Target count of semantic units (words / sentences / paragraphs /
        sections) per chunk. Must be positive.
    max_chars : int
        Hard character ceiling that subclasses are expected never to
        exceed in any emitted chunk. Must be positive.
    tolerance : int
        Number of semantic units to scan forward or backward when
        adjusting a boundary so it lands on a clean break. Must be
        non-negative.

    Raises
    ------
    ValueError
        If ``chunk_size`` or ``max_chars`` is not positive, or if
        ``tolerance`` is negative.
    """

    def __init__(
        self,
        *,
        chunk_size: int,
        max_chars: int,
        tolerance: int,
    ) -> None:
        # Validate up front so subclasses can rely on sane values.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if max_chars <= 0:
            raise ValueError("max_chars must be a positive integer")
        if tolerance < 0:
            raise ValueError("tolerance must be a non-negative integer")
        self.chunk_size = chunk_size
        self.max_chars = max_chars
        self.tolerance = tolerance

    @abstractmethod
    def split(self, text: str) -> List[str]:
        """Split *text* into chunks and return them."""
        ...

    def _strip_and_filter(self, chunks: List[str]) -> List[str]:
        """Strip whitespace and remove empty strings from a list of chunks.

        Returns a new list; the input list is not modified. Order is
        preserved.
        """
        # Strip each chunk exactly once, then drop the ones that end up empty.
        stripped = (chunk.strip() for chunk in chunks)
        return [chunk for chunk in stripped if chunk]