@exulu/backend 1.48.2 → 1.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +351 -42
- package/dist/index.d.cts +96 -1
- package/dist/index.d.ts +96 -1
- package/dist/index.js +340 -38
- package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
- package/ee/python/README.md +295 -0
- package/ee/python/documents/processing/README.md +155 -0
- package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
- package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
- package/ee/python/setup.sh +180 -0
- package/package.json +14 -3
- package/scripts/postinstall.cjs +149 -0
- package/.agents/skills/mintlify/SKILL.md +0 -347
- package/.editorconfig +0 -15
- package/.eslintrc.json +0 -52
- package/.github/workflows/release-backend.yml +0 -38
- package/.husky/commit-msg +0 -1
- package/.jscpd.json +0 -18
- package/.mcp.json +0 -25
- package/.nvmrc +0 -1
- package/.prettierignore +0 -5
- package/.prettierrc.json +0 -12
- package/CHANGELOG.md +0 -8
- package/SECURITY.md +0 -5
- package/commitlint.config.js +0 -4
- package/devops/documentation/patch-older-releases.md +0 -42
- package/ee/documents/processing/build_pdf_processor.sh +0 -35
- package/ee/documents/processing/chunk_markdown.py +0 -263
- package/ee/documents/processing/pdf_processor.spec +0 -115
- package/eslint.config.js +0 -88
- package/jest.config.ts +0 -25
- package/mintlify-docs/.mintignore +0 -7
- package/mintlify-docs/AGENTS.md +0 -33
- package/mintlify-docs/CLAUDE.MD +0 -50
- package/mintlify-docs/CONTRIBUTING.md +0 -32
- package/mintlify-docs/LICENSE +0 -21
- package/mintlify-docs/README.md +0 -55
- package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
- package/mintlify-docs/ai-tools/cursor.mdx +0 -39
- package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
- package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
- package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
- package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
- package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
- package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
- package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
- package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
- package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
- package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
- package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
- package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
- package/mintlify-docs/api-reference/core-types.mdx +0 -585
- package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
- package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
- package/mintlify-docs/api-reference/introduction.mdx +0 -661
- package/mintlify-docs/api-reference/mutations.mdx +0 -1012
- package/mintlify-docs/api-reference/openapi.json +0 -217
- package/mintlify-docs/api-reference/queries.mdx +0 -1154
- package/mintlify-docs/backend/introduction.mdx +0 -218
- package/mintlify-docs/changelog.mdx +0 -387
- package/mintlify-docs/community-edition.mdx +0 -304
- package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
- package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
- package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
- package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
- package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
- package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
- package/mintlify-docs/core/exulu-authentication.mdx +0 -810
- package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
- package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
- package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
- package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
- package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
- package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
- package/mintlify-docs/core/exulu-database.mdx +0 -811
- package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
- package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
- package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
- package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
- package/mintlify-docs/core/exulu-logging.mdx +0 -464
- package/mintlify-docs/core/exulu-otel.mdx +0 -670
- package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
- package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
- package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
- package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
- package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
- package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
- package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
- package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
- package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
- package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
- package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
- package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
- package/mintlify-docs/development.mdx +0 -94
- package/mintlify-docs/docs.json +0 -248
- package/mintlify-docs/enterprise-edition.mdx +0 -538
- package/mintlify-docs/essentials/code.mdx +0 -35
- package/mintlify-docs/essentials/images.mdx +0 -59
- package/mintlify-docs/essentials/markdown.mdx +0 -88
- package/mintlify-docs/essentials/navigation.mdx +0 -87
- package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
- package/mintlify-docs/essentials/settings.mdx +0 -318
- package/mintlify-docs/favicon.svg +0 -3
- package/mintlify-docs/frontend/introduction.mdx +0 -39
- package/mintlify-docs/getting-started.mdx +0 -267
- package/mintlify-docs/guides/custom-agent.mdx +0 -608
- package/mintlify-docs/guides/first-agent.mdx +0 -315
- package/mintlify-docs/images/admin_ui.png +0 -0
- package/mintlify-docs/images/contexts.png +0 -0
- package/mintlify-docs/images/create_agents.png +0 -0
- package/mintlify-docs/images/evals.png +0 -0
- package/mintlify-docs/images/graphql.png +0 -0
- package/mintlify-docs/images/graphql_api.png +0 -0
- package/mintlify-docs/images/hero-dark.png +0 -0
- package/mintlify-docs/images/hero-light.png +0 -0
- package/mintlify-docs/images/hero.png +0 -0
- package/mintlify-docs/images/knowledge_sources.png +0 -0
- package/mintlify-docs/images/mcp.png +0 -0
- package/mintlify-docs/images/scaling.png +0 -0
- package/mintlify-docs/index.mdx +0 -411
- package/mintlify-docs/logo/dark.svg +0 -9
- package/mintlify-docs/logo/light.svg +0 -9
- package/mintlify-docs/partners.mdx +0 -558
- package/mintlify-docs/products.mdx +0 -77
- package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
- package/mintlify-docs/styles.css +0 -207
- package/ngrok.bash +0 -1
- package/ngrok.md +0 -6
- package/ngrok.yml +0 -10
- package/release.config.cjs +0 -15
- package/skills-lock.json +0 -10
- package/types/context-processor.ts +0 -45
- package/types/enums/eval-types.ts +0 -5
- package/types/enums/field-types.ts +0 -1
- package/types/enums/jobs.ts +0 -11
- package/types/enums/statistics.ts +0 -13
- package/types/exulu-table-definition.ts +0 -79
- package/types/file-types.ts +0 -18
- package/types/models/agent-session.ts +0 -27
- package/types/models/agent.ts +0 -68
- package/types/models/context.ts +0 -53
- package/types/models/embedding.ts +0 -17
- package/types/models/eval-run.ts +0 -40
- package/types/models/exulu-agent-tool-config.ts +0 -11
- package/types/models/item.ts +0 -21
- package/types/models/job.ts +0 -8
- package/types/models/project.ts +0 -16
- package/types/models/rate-limiter-rules.ts +0 -7
- package/types/models/test-case.ts +0 -25
- package/types/models/tool.ts +0 -9
- package/types/models/user-role.ts +0 -12
- package/types/models/user.ts +0 -20
- package/types/models/variable.ts +0 -8
- package/types/models/vector-methods.ts +0 -7
- package/types/provider-config.ts +0 -21
- package/types/queue-config.ts +0 -16
- package/types/rbac-rights-modes.ts +0 -1
- package/types/statistics.ts +0 -20
- package/types/workflow.ts +0 -31
- /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
- /package/ee/{documents/processing → python}/requirements.txt +0 -0
|
@@ -1,596 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: "Configuration"
|
|
3
|
-
description: "Complete guide to configuring ExuluChunkers for text splitting"
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
## SentenceChunker configuration
|
|
7
|
-
|
|
8
|
-
### Factory method
|
|
9
|
-
|
|
10
|
-
Create a SentenceChunker using the async factory method:
|
|
11
|
-
|
|
12
|
-
```typescript
|
|
13
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
14
|
-
|
|
15
|
-
const chunker = await ExuluChunkers.sentence.create(options);
|
|
16
|
-
```
|
|
17
|
-
|
|
18
|
-
### Options
|
|
19
|
-
|
|
20
|
-
<ParamField path="chunkSize" type="number" required>
|
|
21
|
-
Maximum number of tokens per chunk
|
|
22
|
-
</ParamField>
|
|
23
|
-
|
|
24
|
-
```typescript
|
|
25
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
26
|
-
chunkSize: 512 // Max 512 tokens per chunk
|
|
27
|
-
});
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
**Guidelines:**
|
|
31
|
-
- Small chunks (128-256): High granularity, more chunks, higher costs
|
|
32
|
-
- Medium chunks (256-512): Balanced for most use cases
|
|
33
|
-
- Large chunks (512-1024): Less granular, fewer chunks, lower costs
|
|
34
|
-
|
|
35
|
-
**Match your embedding model:**
|
|
36
|
-
- OpenAI text-embedding-3-small: 8,191 tokens → use 512-1024
|
|
37
|
-
- OpenAI text-embedding-3-large: 8,191 tokens → use 512-1024
|
|
38
|
-
- Cohere embed-english-v3.0: 512 tokens → use 256-512
|
|
39
|
-
|
|
40
|
-
<ParamField path="chunkOverlap" type="number" default={0}>
|
|
41
|
-
Number of tokens to overlap between consecutive chunks (default: 0)
|
|
42
|
-
</ParamField>
|
|
43
|
-
|
|
44
|
-
```typescript
|
|
45
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
46
|
-
chunkSize: 512,
|
|
47
|
-
chunkOverlap: 50 // 50 tokens overlap
|
|
48
|
-
});
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
**Guidelines:**
|
|
52
|
-
- No overlap (0): No redundancy, sharp boundaries
|
|
53
|
-
- Low overlap (10-20): Minimal context preservation
|
|
54
|
-
- Medium overlap (50-100): Good balance for natural language
|
|
55
|
-
- High overlap (100-200): Maximum context, but increases chunk count
|
|
56
|
-
|
|
57
|
-
**Recommended overlap ratios:**
|
|
58
|
-
- 10-15% of chunk size for technical docs
|
|
59
|
-
- 15-20% of chunk size for natural language
|
|
60
|
-
- 20-25% of chunk size for narrative content
|
|
61
|
-
|
|
62
|
-
<Warning>
|
|
63
|
-
Overlap must be less than `chunkSize`. The chunker throws an error if `chunkOverlap >= chunkSize`.
|
|
64
|
-
</Warning>
|
|
65
|
-
|
|
66
|
-
<ParamField path="minSentencesPerChunk" type="number" default={1}>
|
|
67
|
-
Minimum number of sentences per chunk (default: 1)
|
|
68
|
-
</ParamField>
|
|
69
|
-
|
|
70
|
-
```typescript
|
|
71
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
72
|
-
chunkSize: 512,
|
|
73
|
-
minSentencesPerChunk: 2 // At least 2 sentences per chunk
|
|
74
|
-
});
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
**Use cases:**
|
|
78
|
-
- `1`: Allow single-sentence chunks (default)
|
|
79
|
-
- `2-3`: Ensure contextual coherence
|
|
80
|
-
- `3+`: For documents where individual sentences lack context
|
|
81
|
-
|
|
82
|
-
<ParamField path="minCharactersPerSentence" type="number" default={10}>
|
|
83
|
-
Minimum character length for a text segment to be considered a sentence (default: 10)
|
|
84
|
-
</ParamField>
|
|
85
|
-
|
|
86
|
-
```typescript
|
|
87
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
88
|
-
chunkSize: 512,
|
|
89
|
-
minCharactersPerSentence: 20 // Sentences must be at least 20 chars
|
|
90
|
-
});
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
**Use cases:**
|
|
94
|
-
- `5-10`: Allow short sentences (e.g., "Yes.", "No.")
|
|
95
|
-
- `10-20`: Filter out fragments (default)
|
|
96
|
-
- `20+`: Ensure substantive sentences
|
|
97
|
-
|
|
98
|
-
### Complete example
|
|
99
|
-
|
|
100
|
-
```typescript
|
|
101
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
102
|
-
|
|
103
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
104
|
-
chunkSize: 512, // Max 512 tokens per chunk
|
|
105
|
-
chunkOverlap: 75, // 75 tokens overlap (15% of chunk size)
|
|
106
|
-
minSentencesPerChunk: 2, // At least 2 sentences per chunk
|
|
107
|
-
minCharactersPerSentence: 15 // Sentences must be at least 15 chars
|
|
108
|
-
});
|
|
109
|
-
|
|
110
|
-
const text = `
|
|
111
|
-
Machine learning is transforming industries. It enables
|
|
112
|
-
computers to learn patterns from data. This technology
|
|
113
|
-
powers recommendation systems, fraud detection, and more.
|
|
114
|
-
|
|
115
|
-
Deep learning is a subset of machine learning. It uses
|
|
116
|
-
neural networks with many layers. These networks can
|
|
117
|
-
recognize complex patterns in images, text, and audio.
|
|
118
|
-
`;
|
|
119
|
-
|
|
120
|
-
const chunks = await chunker(text);
|
|
121
|
-
|
|
122
|
-
console.log(chunks.length); // Number of chunks
|
|
123
|
-
console.log(chunks[0].text); // First chunk text
|
|
124
|
-
console.log(chunks[0].tokenCount); // Token count
|
|
125
|
-
```
|
|
126
|
-
|
|
127
|
-
## RecursiveChunker configuration
|
|
128
|
-
|
|
129
|
-
### Factory method
|
|
130
|
-
|
|
131
|
-
Create a RecursiveChunker using the async factory method:
|
|
132
|
-
|
|
133
|
-
```typescript
|
|
134
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
135
|
-
|
|
136
|
-
const chunker = await ExuluChunkers.recursive.function.create(options);
|
|
137
|
-
```
|
|
138
|
-
|
|
139
|
-
### Options
|
|
140
|
-
|
|
141
|
-
<ParamField path="chunkSize" type="number" required>
|
|
142
|
-
Maximum number of tokens per chunk
|
|
143
|
-
</ParamField>
|
|
144
|
-
|
|
145
|
-
```typescript
|
|
146
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
147
|
-
chunkSize: 1024 // Max 1024 tokens per chunk
|
|
148
|
-
});
|
|
149
|
-
```
|
|
150
|
-
|
|
151
|
-
Same guidelines as SentenceChunker `chunkSize`.
|
|
152
|
-
|
|
153
|
-
<ParamField path="rules" type="RecursiveRules" default="default rules">
|
|
154
|
-
Recursive splitting rules defining the hierarchy (default: paragraphs → sentences → pauses → words → tokens)
|
|
155
|
-
</ParamField>
|
|
156
|
-
|
|
157
|
-
```typescript
|
|
158
|
-
// Use default rules
|
|
159
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
160
|
-
chunkSize: 1024
|
|
161
|
-
// rules not specified = default rules
|
|
162
|
-
});
|
|
163
|
-
|
|
164
|
-
// Or specify custom rules
|
|
165
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
166
|
-
levels: [
|
|
167
|
-
{ delimiters: ["\n\n"] }, // Split by double newline
|
|
168
|
-
{ delimiters: [". ", "! ", "? "] }, // Then sentences
|
|
169
|
-
{ whitespace: true } // Then whitespace
|
|
170
|
-
]
|
|
171
|
-
});
|
|
172
|
-
|
|
173
|
-
const customChunker = await ExuluChunkers.recursive.function.create({
|
|
174
|
-
chunkSize: 1024,
|
|
175
|
-
rules: rules
|
|
176
|
-
});
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
**Default rules hierarchy:**
|
|
180
|
-
1. **Paragraphs**: `["\n\n", "\r\n", "\n", "\r"]`
|
|
181
|
-
2. **Sentences**: `[". ", "! ", "? "]`
|
|
182
|
-
3. **Pauses**: `` ["{", "}", '"', "[", "]", "<", ">", "(", ")", ":", ";", ",", "—", "|", "~", "-", "...", "`", "'"] ``
|
|
183
|
-
4. **Words**: `whitespace: true`
|
|
184
|
-
5. **Tokens**: No delimiters (fallback to token-level splitting)
|
|
185
|
-
|
|
186
|
-
<ParamField path="minCharactersPerChunk" type="number" default={50}>
|
|
187
|
-
Minimum character length for a chunk (default: 50)
|
|
188
|
-
</ParamField>
|
|
189
|
-
|
|
190
|
-
```typescript
|
|
191
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
192
|
-
chunkSize: 1024,
|
|
193
|
-
minCharactersPerChunk: 100 // Chunks must be at least 100 chars
|
|
194
|
-
});
|
|
195
|
-
```
|
|
196
|
-
|
|
197
|
-
**Guidelines:**
|
|
198
|
-
- `20-50`: Allow smaller chunks
|
|
199
|
-
- `50-100`: Default range
|
|
200
|
-
- `100+`: Ensure substantive chunks
|
|
201
|
-
|
|
202
|
-
### Complete example
|
|
203
|
-
|
|
204
|
-
```typescript
|
|
205
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
206
|
-
|
|
207
|
-
// Custom rules for markdown documents
|
|
208
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
209
|
-
levels: [
|
|
210
|
-
{ delimiters: ["\n## ", "\n### "] }, // Split by headers
|
|
211
|
-
{ delimiters: ["\n\n"] }, // Then paragraphs
|
|
212
|
-
{ delimiters: [". ", "! ", "? "] }, // Then sentences
|
|
213
|
-
{ whitespace: true } // Then words
|
|
214
|
-
]
|
|
215
|
-
});
|
|
216
|
-
|
|
217
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
218
|
-
chunkSize: 1024,
|
|
219
|
-
rules: rules,
|
|
220
|
-
minCharactersPerChunk: 75
|
|
221
|
-
});
|
|
222
|
-
|
|
223
|
-
const markdown = `
|
|
224
|
-
## Introduction
|
|
225
|
-
|
|
226
|
-
Machine learning enables computers to learn from data.
|
|
227
|
-
It powers many modern applications.
|
|
228
|
-
|
|
229
|
-
## Applications
|
|
230
|
-
|
|
231
|
-
Recommendation systems use ML to suggest content.
|
|
232
|
-
Fraud detection systems identify suspicious activity.
|
|
233
|
-
|
|
234
|
-
## Future Directions
|
|
235
|
-
|
|
236
|
-
The field continues to evolve rapidly.
|
|
237
|
-
New techniques emerge regularly.
|
|
238
|
-
`;
|
|
239
|
-
|
|
240
|
-
const chunks = await chunker(markdown);
|
|
241
|
-
|
|
242
|
-
for (const chunk of chunks) {
|
|
243
|
-
console.log(`Level ${chunk.level}: ${chunk.text.slice(0, 50)}...`);
|
|
244
|
-
console.log(`Tokens: ${chunk.tokenCount}`);
|
|
245
|
-
}
|
|
246
|
-
```
|
|
247
|
-
|
|
248
|
-
## RecursiveRules configuration
|
|
249
|
-
|
|
250
|
-
### Constructor
|
|
251
|
-
|
|
252
|
-
Create custom recursive rules:
|
|
253
|
-
|
|
254
|
-
```typescript
|
|
255
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
256
|
-
|
|
257
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
258
|
-
levels: [...] // Array of RecursiveLevelData
|
|
259
|
-
});
|
|
260
|
-
```
|
|
261
|
-
|
|
262
|
-
### Levels
|
|
263
|
-
|
|
264
|
-
<ParamField path="levels" type="RecursiveLevelData[]">
|
|
265
|
-
Array of recursive levels defining the splitting hierarchy
|
|
266
|
-
</ParamField>
|
|
267
|
-
|
|
268
|
-
```typescript
|
|
269
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
270
|
-
levels: [
|
|
271
|
-
{ delimiters: ["\n\n"] },
|
|
272
|
-
{ delimiters: [". "] },
|
|
273
|
-
{ whitespace: true }
|
|
274
|
-
]
|
|
275
|
-
});
|
|
276
|
-
```
|
|
277
|
-
|
|
278
|
-
Each level is a `RecursiveLevelData` object with:
|
|
279
|
-
|
|
280
|
-
<ParamField path="delimiters" type="string | string[]">
|
|
281
|
-
Delimiter(s) to use for splitting at this level
|
|
282
|
-
</ParamField>
|
|
283
|
-
|
|
284
|
-
```typescript
|
|
285
|
-
// Single delimiter
|
|
286
|
-
{ delimiters: "\n\n" }
|
|
287
|
-
|
|
288
|
-
// Multiple delimiters
|
|
289
|
-
{ delimiters: [". ", "! ", "? "] }
|
|
290
|
-
```
|
|
291
|
-
|
|
292
|
-
<ParamField path="whitespace" type="boolean" default={false}>
|
|
293
|
-
Whether to split on whitespace at this level (default: false)
|
|
294
|
-
</ParamField>
|
|
295
|
-
|
|
296
|
-
```typescript
|
|
297
|
-
// Split on any whitespace character
|
|
298
|
-
{ whitespace: true }
|
|
299
|
-
```
|
|
300
|
-
|
|
301
|
-
<Warning>
|
|
302
|
-
A level cannot use both `delimiters` and `whitespace`; the two options are mutually exclusive.
|
|
303
|
-
</Warning>
|
|
304
|
-
|
|
305
|
-
<ParamField path="includeDelim" type="'prev' | 'next'" default="prev">
|
|
306
|
-
Whether to include the delimiter in the previous or next chunk (default: "prev")
|
|
307
|
-
</ParamField>
|
|
308
|
-
|
|
309
|
-
```typescript
|
|
310
|
-
// Delimiter stays with previous chunk
|
|
311
|
-
{ delimiters: [". "], includeDelim: "prev" }
|
|
312
|
-
|
|
313
|
-
// Delimiter moves to next chunk
|
|
314
|
-
{ delimiters: ["\n## "], includeDelim: "next" }
|
|
315
|
-
```
|
|
316
|
-
|
|
317
|
-
**Use cases:**
|
|
318
|
-
- `"prev"`: For punctuation (sentences keep their periods)
|
|
319
|
-
- `"next"`: For headers (headers stay with their content)
|
|
320
|
-
|
|
321
|
-
### RecursiveLevel examples
|
|
322
|
-
|
|
323
|
-
<Tabs>
|
|
324
|
-
<Tab title="Markdown documents">
|
|
325
|
-
```typescript
|
|
326
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
327
|
-
levels: [
|
|
328
|
-
// Split by headers (keep header with content)
|
|
329
|
-
{
|
|
330
|
-
delimiters: ["\n# ", "\n## ", "\n### "],
|
|
331
|
-
includeDelim: "next"
|
|
332
|
-
},
|
|
333
|
-
// Split by paragraphs
|
|
334
|
-
{ delimiters: ["\n\n"] },
|
|
335
|
-
// Split by sentences
|
|
336
|
-
{ delimiters: [". ", "! ", "? "] },
|
|
337
|
-
// Split by words
|
|
338
|
-
{ whitespace: true }
|
|
339
|
-
]
|
|
340
|
-
});
|
|
341
|
-
```
|
|
342
|
-
</Tab>
|
|
343
|
-
|
|
344
|
-
<Tab title="Code documentation">
|
|
345
|
-
```typescript
|
|
346
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
347
|
-
levels: [
|
|
348
|
-
// Split by code blocks
|
|
349
|
-
{ delimiters: ["```"] },
|
|
350
|
-
// Split by sections
|
|
351
|
-
{ delimiters: ["\n\n"] },
|
|
352
|
-
// Split by list items
|
|
353
|
-
{ delimiters: ["\n- ", "\n* ", "\n1. "] },
|
|
354
|
-
// Split by sentences
|
|
355
|
-
{ delimiters: [". "] },
|
|
356
|
-
// Split by words
|
|
357
|
-
{ whitespace: true }
|
|
358
|
-
]
|
|
359
|
-
});
|
|
360
|
-
```
|
|
361
|
-
</Tab>
|
|
362
|
-
|
|
363
|
-
<Tab title="Structured data">
|
|
364
|
-
```typescript
|
|
365
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
366
|
-
levels: [
|
|
367
|
-
// Split by JSON objects
|
|
368
|
-
{ delimiters: ["},\n"] },
|
|
369
|
-
// Split by object properties
|
|
370
|
-
{ delimiters: [",\n "] },
|
|
371
|
-
// Split by words
|
|
372
|
-
{ whitespace: true }
|
|
373
|
-
]
|
|
374
|
-
});
|
|
375
|
-
```
|
|
376
|
-
</Tab>
|
|
377
|
-
|
|
378
|
-
<Tab title="Minimal (aggressive)">
|
|
379
|
-
```typescript
|
|
380
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
381
|
-
levels: [
|
|
382
|
-
// Only paragraphs and words
|
|
383
|
-
{ delimiters: ["\n\n"] },
|
|
384
|
-
{ whitespace: true }
|
|
385
|
-
]
|
|
386
|
-
});
|
|
387
|
-
```
|
|
388
|
-
</Tab>
|
|
389
|
-
</Tabs>
|
|
390
|
-
|
|
391
|
-
## Configuration patterns
|
|
392
|
-
|
|
393
|
-
### Natural language documents
|
|
394
|
-
|
|
395
|
-
```typescript
|
|
396
|
-
// Optimized for articles, blog posts, documentation
|
|
397
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
398
|
-
chunkSize: 512,
|
|
399
|
-
chunkOverlap: 75, // 15% overlap
|
|
400
|
-
minSentencesPerChunk: 2, // At least 2 sentences
|
|
401
|
-
minCharactersPerSentence: 15
|
|
402
|
-
});
|
|
403
|
-
```
|
|
404
|
-
|
|
405
|
-
**Why this works:**
|
|
406
|
-
- 512 tokens fits most embedding models
|
|
407
|
-
- 15% overlap preserves context
|
|
408
|
-
- Minimum 2 sentences ensures coherence
|
|
409
|
-
- 15 char minimum filters fragments
|
|
410
|
-
|
|
411
|
-
### Technical documentation
|
|
412
|
-
|
|
413
|
-
```typescript
|
|
414
|
-
// Optimized for API docs, guides, tutorials
|
|
415
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
416
|
-
levels: [
|
|
417
|
-
{ delimiters: ["\n## ", "\n### "] }, // Headers
|
|
418
|
-
{ delimiters: ["```"] }, // Code blocks
|
|
419
|
-
{ delimiters: ["\n\n"] }, // Paragraphs
|
|
420
|
-
{ delimiters: [". "] }, // Sentences
|
|
421
|
-
{ whitespace: true } // Words
|
|
422
|
-
]
|
|
423
|
-
});
|
|
424
|
-
|
|
425
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
426
|
-
chunkSize: 1024, // Larger for code examples
|
|
427
|
-
rules: rules,
|
|
428
|
-
minCharactersPerChunk: 100
|
|
429
|
-
});
|
|
430
|
-
```
|
|
431
|
-
|
|
432
|
-
**Why this works:**
|
|
433
|
-
- Respects structural boundaries (headers, code)
|
|
434
|
-
- 1024 tokens accommodates code examples
|
|
435
|
-
- 100 char minimum ensures substantive chunks
|
|
436
|
-
|
|
437
|
-
### Long-form content
|
|
438
|
-
|
|
439
|
-
```typescript
|
|
440
|
-
// Optimized for books, papers, long articles
|
|
441
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
442
|
-
chunkSize: 768, // Larger chunks
|
|
443
|
-
chunkOverlap: 150, // ~20% overlap
|
|
444
|
-
minSentencesPerChunk: 3, // More context per chunk
|
|
445
|
-
minCharactersPerSentence: 20
|
|
446
|
-
});
|
|
447
|
-
```
|
|
448
|
-
|
|
449
|
-
**Why this works:**
|
|
450
|
-
- Larger chunks capture more context
|
|
451
|
-
- Higher overlap maintains narrative flow
|
|
452
|
-
- Minimum 3 sentences ensures coherence
|
|
453
|
-
- 20 char minimum ensures quality sentences
|
|
454
|
-
|
|
455
|
-
### High-precision search
|
|
456
|
-
|
|
457
|
-
```typescript
|
|
458
|
-
// Optimized for precise search results
|
|
459
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
460
|
-
chunkSize: 256, // Smaller chunks
|
|
461
|
-
chunkOverlap: 25, // ~10% overlap
|
|
462
|
-
minSentencesPerChunk: 1, // Allow single sentences
|
|
463
|
-
minCharactersPerSentence: 10
|
|
464
|
-
});
|
|
465
|
-
```
|
|
466
|
-
|
|
467
|
-
**Why this works:**
|
|
468
|
-
- Smaller chunks = more precise results
|
|
469
|
-
- Lower overlap = less redundancy
|
|
470
|
-
- Single sentences allowed for granularity
|
|
471
|
-
|
|
472
|
-
### Code files
|
|
473
|
-
|
|
474
|
-
```typescript
|
|
475
|
-
// Optimized for source code
|
|
476
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
477
|
-
levels: [
|
|
478
|
-
{ delimiters: ["\nclass ", "\nfunction ", "\nconst "] }, // Top-level declarations
|
|
479
|
-
{ delimiters: ["{\n", "}\n"] }, // Blocks
|
|
480
|
-
{ delimiters: ["\n"] }, // Lines
|
|
481
|
-
{ whitespace: true } // Words
|
|
482
|
-
]
|
|
483
|
-
});
|
|
484
|
-
|
|
485
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
486
|
-
chunkSize: 2048, // Larger for functions/classes
|
|
487
|
-
rules: rules,
|
|
488
|
-
minCharactersPerChunk: 50
|
|
489
|
-
});
|
|
490
|
-
```
|
|
491
|
-
|
|
492
|
-
**Why this works:**
|
|
493
|
-
- Respects code structure (functions, classes)
|
|
494
|
-
- Large chunks keep functions/methods together
|
|
495
|
-
- Line-level splitting for smaller units
|
|
496
|
-
|
|
497
|
-
## Tuning recommendations
|
|
498
|
-
|
|
499
|
-
### Start conservative
|
|
500
|
-
|
|
501
|
-
```typescript
|
|
502
|
-
// Begin with safe defaults
|
|
503
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
504
|
-
chunkSize: 512,
|
|
505
|
-
chunkOverlap: 50,
|
|
506
|
-
minSentencesPerChunk: 1,
|
|
507
|
-
minCharactersPerSentence: 10
|
|
508
|
-
});
|
|
509
|
-
|
|
510
|
-
// Test and adjust based on:
|
|
511
|
-
// - Search result quality
|
|
512
|
-
// - Chunk count and costs
|
|
513
|
-
// - Context preservation
|
|
514
|
-
```
|
|
515
|
-
|
|
516
|
-
### Monitor chunk statistics
|
|
517
|
-
|
|
518
|
-
```typescript
|
|
519
|
-
const chunks = await chunker(text);
|
|
520
|
-
|
|
521
|
-
const avgTokens = chunks.reduce((sum, c) => sum + c.tokenCount, 0) / chunks.length;
|
|
522
|
-
const maxTokens = Math.max(...chunks.map(c => c.tokenCount));
|
|
523
|
-
const minTokens = Math.min(...chunks.map(c => c.tokenCount));
|
|
524
|
-
|
|
525
|
-
console.log(`Chunks: ${chunks.length}`);
|
|
526
|
-
console.log(`Avg tokens: ${avgTokens.toFixed(2)}`);
|
|
527
|
-
console.log(`Max tokens: ${maxTokens}`);
|
|
528
|
-
console.log(`Min tokens: ${minTokens}`);
|
|
529
|
-
|
|
530
|
-
// Adjust configuration based on statistics
|
|
531
|
-
```
|
|
532
|
-
|
|
533
|
-
### Test with real data
|
|
534
|
-
|
|
535
|
-
```typescript
|
|
536
|
-
// Sample representative documents
|
|
537
|
-
const sampleDocs = [
|
|
538
|
-
"Short article content...",
|
|
539
|
-
"Medium length blog post...",
|
|
540
|
-
"Very long technical documentation..."
|
|
541
|
-
];
|
|
542
|
-
|
|
543
|
-
// Test chunking
|
|
544
|
-
for (const doc of sampleDocs) {
|
|
545
|
-
const chunks = await chunker(doc);
|
|
546
|
-
console.log(`Doc length: ${doc.length} chars`);
|
|
547
|
-
console.log(`Chunks: ${chunks.length}`);
|
|
548
|
-
console.log(`Avg chunk: ${doc.length / chunks.length} chars`);
|
|
549
|
-
console.log("---");
|
|
550
|
-
}
|
|
551
|
-
|
|
552
|
-
// Adjust based on results
|
|
553
|
-
```
|
|
554
|
-
|
|
555
|
-
## Common pitfalls
|
|
556
|
-
|
|
557
|
-
<Warning>
|
|
558
|
-
**Overlap >= chunk size**: The chunker will throw an error. Ensure `chunkOverlap < chunkSize`.
|
|
559
|
-
</Warning>
|
|
560
|
-
|
|
561
|
-
<Warning>
|
|
562
|
-
**Chunk size too small**: Very small chunks lose context. Minimum recommended: 128 tokens.
|
|
563
|
-
</Warning>
|
|
564
|
-
|
|
565
|
-
<Warning>
|
|
566
|
-
**No overlap on narrative content**: Natural language benefits from overlap. Use 15-25% overlap for continuity (15-20% for general natural language, 20-25% for narrative content).
|
|
567
|
-
</Warning>
|
|
568
|
-
|
|
569
|
-
<Warning>
|
|
570
|
-
**Wrong chunker for content type**: Use SentenceChunker for natural language, RecursiveChunker for structured content.
|
|
571
|
-
</Warning>
|
|
572
|
-
|
|
573
|
-
## Best practices
|
|
574
|
-
|
|
575
|
-
<Tip>
|
|
576
|
-
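**Match embedding model**: Keep `chunkSize` at or below 60-80% of your embedding model's token limit to leave room for metadata; for models with large limits, the 512-1024 range recommended above is usually sufficient.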
|
|
577
|
-
</Tip>
|
|
578
|
-
|
|
579
|
-
<Note>
|
|
580
|
-
**Use overlap for RAG**: Overlap improves retrieval quality in RAG systems by ensuring context isn't lost at boundaries.
|
|
581
|
-
</Note>
|
|
582
|
-
|
|
583
|
-
<Info>
|
|
584
|
-
**Custom rules for domain content**: If your documents have consistent structure (e.g., legal docs, medical records), create custom RecursiveRules to respect that structure.
|
|
585
|
-
</Info>
|
|
586
|
-
|
|
587
|
-
## Next steps
|
|
588
|
-
|
|
589
|
-
<CardGroup cols={2}>
|
|
590
|
-
<Card title="API reference" icon="code" href="/core/exulu-chunkers/api-reference">
|
|
591
|
-
Explore methods and properties
|
|
592
|
-
</Card>
|
|
593
|
-
<Card title="Overview" icon="book" href="/core/exulu-chunkers/introduction">
|
|
594
|
-
Learn about chunking concepts
|
|
595
|
-
</Card>
|
|
596
|
-
</CardGroup>
|