@exulu/backend 1.48.2 → 1.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +351 -42
- package/dist/index.d.cts +96 -1
- package/dist/index.d.ts +96 -1
- package/dist/index.js +340 -38
- package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
- package/ee/python/README.md +295 -0
- package/ee/python/documents/processing/README.md +155 -0
- package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
- package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
- package/ee/python/setup.sh +180 -0
- package/package.json +14 -3
- package/scripts/postinstall.cjs +149 -0
- package/.agents/skills/mintlify/SKILL.md +0 -347
- package/.editorconfig +0 -15
- package/.eslintrc.json +0 -52
- package/.github/workflows/release-backend.yml +0 -38
- package/.husky/commit-msg +0 -1
- package/.jscpd.json +0 -18
- package/.mcp.json +0 -25
- package/.nvmrc +0 -1
- package/.prettierignore +0 -5
- package/.prettierrc.json +0 -12
- package/CHANGELOG.md +0 -8
- package/SECURITY.md +0 -5
- package/commitlint.config.js +0 -4
- package/devops/documentation/patch-older-releases.md +0 -42
- package/ee/documents/processing/build_pdf_processor.sh +0 -35
- package/ee/documents/processing/chunk_markdown.py +0 -263
- package/ee/documents/processing/pdf_processor.spec +0 -115
- package/eslint.config.js +0 -88
- package/jest.config.ts +0 -25
- package/mintlify-docs/.mintignore +0 -7
- package/mintlify-docs/AGENTS.md +0 -33
- package/mintlify-docs/CLAUDE.MD +0 -50
- package/mintlify-docs/CONTRIBUTING.md +0 -32
- package/mintlify-docs/LICENSE +0 -21
- package/mintlify-docs/README.md +0 -55
- package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
- package/mintlify-docs/ai-tools/cursor.mdx +0 -39
- package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
- package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
- package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
- package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
- package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
- package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
- package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
- package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
- package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
- package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
- package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
- package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
- package/mintlify-docs/api-reference/core-types.mdx +0 -585
- package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
- package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
- package/mintlify-docs/api-reference/introduction.mdx +0 -661
- package/mintlify-docs/api-reference/mutations.mdx +0 -1012
- package/mintlify-docs/api-reference/openapi.json +0 -217
- package/mintlify-docs/api-reference/queries.mdx +0 -1154
- package/mintlify-docs/backend/introduction.mdx +0 -218
- package/mintlify-docs/changelog.mdx +0 -387
- package/mintlify-docs/community-edition.mdx +0 -304
- package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
- package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
- package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
- package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
- package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
- package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
- package/mintlify-docs/core/exulu-authentication.mdx +0 -810
- package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
- package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
- package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
- package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
- package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
- package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
- package/mintlify-docs/core/exulu-database.mdx +0 -811
- package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
- package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
- package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
- package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
- package/mintlify-docs/core/exulu-logging.mdx +0 -464
- package/mintlify-docs/core/exulu-otel.mdx +0 -670
- package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
- package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
- package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
- package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
- package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
- package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
- package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
- package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
- package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
- package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
- package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
- package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
- package/mintlify-docs/development.mdx +0 -94
- package/mintlify-docs/docs.json +0 -248
- package/mintlify-docs/enterprise-edition.mdx +0 -538
- package/mintlify-docs/essentials/code.mdx +0 -35
- package/mintlify-docs/essentials/images.mdx +0 -59
- package/mintlify-docs/essentials/markdown.mdx +0 -88
- package/mintlify-docs/essentials/navigation.mdx +0 -87
- package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
- package/mintlify-docs/essentials/settings.mdx +0 -318
- package/mintlify-docs/favicon.svg +0 -3
- package/mintlify-docs/frontend/introduction.mdx +0 -39
- package/mintlify-docs/getting-started.mdx +0 -267
- package/mintlify-docs/guides/custom-agent.mdx +0 -608
- package/mintlify-docs/guides/first-agent.mdx +0 -315
- package/mintlify-docs/images/admin_ui.png +0 -0
- package/mintlify-docs/images/contexts.png +0 -0
- package/mintlify-docs/images/create_agents.png +0 -0
- package/mintlify-docs/images/evals.png +0 -0
- package/mintlify-docs/images/graphql.png +0 -0
- package/mintlify-docs/images/graphql_api.png +0 -0
- package/mintlify-docs/images/hero-dark.png +0 -0
- package/mintlify-docs/images/hero-light.png +0 -0
- package/mintlify-docs/images/hero.png +0 -0
- package/mintlify-docs/images/knowledge_sources.png +0 -0
- package/mintlify-docs/images/mcp.png +0 -0
- package/mintlify-docs/images/scaling.png +0 -0
- package/mintlify-docs/index.mdx +0 -411
- package/mintlify-docs/logo/dark.svg +0 -9
- package/mintlify-docs/logo/light.svg +0 -9
- package/mintlify-docs/partners.mdx +0 -558
- package/mintlify-docs/products.mdx +0 -77
- package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
- package/mintlify-docs/styles.css +0 -207
- package/ngrok.bash +0 -1
- package/ngrok.md +0 -6
- package/ngrok.yml +0 -10
- package/release.config.cjs +0 -15
- package/skills-lock.json +0 -10
- package/types/context-processor.ts +0 -45
- package/types/enums/eval-types.ts +0 -5
- package/types/enums/field-types.ts +0 -1
- package/types/enums/jobs.ts +0 -11
- package/types/enums/statistics.ts +0 -13
- package/types/exulu-table-definition.ts +0 -79
- package/types/file-types.ts +0 -18
- package/types/models/agent-session.ts +0 -27
- package/types/models/agent.ts +0 -68
- package/types/models/context.ts +0 -53
- package/types/models/embedding.ts +0 -17
- package/types/models/eval-run.ts +0 -40
- package/types/models/exulu-agent-tool-config.ts +0 -11
- package/types/models/item.ts +0 -21
- package/types/models/job.ts +0 -8
- package/types/models/project.ts +0 -16
- package/types/models/rate-limiter-rules.ts +0 -7
- package/types/models/test-case.ts +0 -25
- package/types/models/tool.ts +0 -9
- package/types/models/user-role.ts +0 -12
- package/types/models/user.ts +0 -20
- package/types/models/variable.ts +0 -8
- package/types/models/vector-methods.ts +0 -7
- package/types/provider-config.ts +0 -21
- package/types/queue-config.ts +0 -16
- package/types/rbac-rights-modes.ts +0 -1
- package/types/statistics.ts +0 -20
- package/types/workflow.ts +0 -31
- /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
- /package/ee/{documents/processing → python}/requirements.txt +0 -0
|
@@ -1,403 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: "Overview"
|
|
3
|
-
description: "Text chunking utilities for splitting documents into semantic segments"
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
## Overview
|
|
7
|
-
|
|
8
|
-
`ExuluChunkers` provides text chunking utilities that split large documents into smaller, meaningful segments for embedding generation and semantic search. The package includes two specialized chunking strategies: sentence-based chunking for natural language text and recursive chunking for hierarchical document structures.
|
|
9
|
-
|
|
10
|
-
## Key features
|
|
11
|
-
|
|
12
|
-
<CardGroup cols={2}>
|
|
13
|
-
<Card title="Sentence chunking" icon="paragraph">
|
|
14
|
-
Splits text into sentence-based chunks with configurable overlap
|
|
15
|
-
</Card>
|
|
16
|
-
<Card title="Recursive chunking" icon="layer-group">
|
|
17
|
-
Hierarchical chunking with customizable splitting rules
|
|
18
|
-
</Card>
|
|
19
|
-
<Card title="Token-aware" icon="hashtag">
|
|
20
|
-
Built-in tokenizer respects token limits for embeddings
|
|
21
|
-
</Card>
|
|
22
|
-
<Card title="Configurable overlap" icon="arrows-left-right">
|
|
23
|
-
Control chunk overlap for better context preservation
|
|
24
|
-
</Card>
|
|
25
|
-
<Card title="Callable interface" icon="code">
|
|
26
|
-
Chunkers are callable functions for intuitive usage
|
|
27
|
-
</Card>
|
|
28
|
-
<Card title="Factory pattern" icon="wrench">
|
|
29
|
-
Async initialization with `.create()` method
|
|
30
|
-
</Card>
|
|
31
|
-
</CardGroup>
|
|
32
|
-
|
|
33
|
-
## Why chunking matters
|
|
34
|
-
|
|
35
|
-
When working with large documents and language models, chunking is essential:
|
|
36
|
-
|
|
37
|
-
<AccordionGroup>
|
|
38
|
-
<Accordion title="Token limits">
|
|
39
|
-
Embedding models have token limits (e.g., 8,192 for text-embedding-3-small). Chunking ensures text fits within these limits while preserving semantic coherence.
|
|
40
|
-
</Accordion>
|
|
41
|
-
|
|
42
|
-
<Accordion title="Search granularity">
|
|
43
|
-
Smaller chunks provide more precise search results. Instead of returning an entire document, users get the specific paragraph or section relevant to their query.
|
|
44
|
-
</Accordion>
|
|
45
|
-
|
|
46
|
-
<Accordion title="Context preservation">
|
|
47
|
-
Chunk overlap ensures important context isn't lost at boundaries. When a sentence is split across chunks, overlap captures the complete thought.
|
|
48
|
-
</Accordion>
|
|
49
|
-
|
|
50
|
-
<Accordion title="Embedding quality">
|
|
51
|
-
Semantically coherent chunks produce better embeddings. Sentence-based and hierarchical chunking maintain natural text boundaries.
|
|
52
|
-
</Accordion>
|
|
53
|
-
</AccordionGroup>
|
|
54
|
-
|
|
55
|
-
## Available chunkers
|
|
56
|
-
|
|
57
|
-
### SentenceChunker
|
|
58
|
-
|
|
59
|
-
Splits text into chunks at sentence boundaries, respecting token limits:
|
|
60
|
-
|
|
61
|
-
```typescript
|
|
62
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
63
|
-
|
|
64
|
-
// Create sentence chunker
|
|
65
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
66
|
-
chunkSize: 512, // Max 512 tokens per chunk
|
|
67
|
-
chunkOverlap: 50, // 50 tokens overlap between chunks
|
|
68
|
-
minSentencesPerChunk: 1,
|
|
69
|
-
minCharactersPerSentence: 10
|
|
70
|
-
});
|
|
71
|
-
|
|
72
|
-
// Chunk text
|
|
73
|
-
const chunks = await chunker("Your long document text here...");
|
|
74
|
-
|
|
75
|
-
console.log(chunks.length); // Number of chunks
|
|
76
|
-
console.log(chunks[0].text); // First chunk text
|
|
77
|
-
console.log(chunks[0].tokenCount); // Token count
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
**Use SentenceChunker for:**
|
|
81
|
-
- Natural language documents (articles, blog posts, documentation)
|
|
82
|
-
- Text where sentence boundaries are important
|
|
83
|
-
- Content that benefits from grammatical coherence
|
|
84
|
-
|
|
85
|
-
### RecursiveChunker
|
|
86
|
-
|
|
87
|
-
Hierarchically splits text using customizable rules (paragraphs → sentences → pauses → words → tokens):
|
|
88
|
-
|
|
89
|
-
```typescript
|
|
90
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
91
|
-
|
|
92
|
-
// Create recursive chunker with default rules
|
|
93
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
94
|
-
chunkSize: 1024, // Max 1024 tokens per chunk
|
|
95
|
-
minCharactersPerChunk: 50
|
|
96
|
-
});
|
|
97
|
-
|
|
98
|
-
// Or with custom rules
|
|
99
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
100
|
-
levels: [
|
|
101
|
-
{ delimiters: ["\n\n", "\n"] }, // Split by paragraphs
|
|
102
|
-
{ delimiters: [". ", "! ", "? "] }, // Then sentences
|
|
103
|
-
{ whitespace: true } // Then words
|
|
104
|
-
]
|
|
105
|
-
});
|
|
106
|
-
|
|
107
|
-
const customChunker = await ExuluChunkers.recursive.function.create({
|
|
108
|
-
chunkSize: 1024,
|
|
109
|
-
rules: rules
|
|
110
|
-
});
|
|
111
|
-
|
|
112
|
-
// Chunk text
|
|
113
|
-
const chunks = await customChunker("Your document...");
|
|
114
|
-
|
|
115
|
-
console.log(chunks[0].level); // Recursion level used for this chunk
|
|
116
|
-
```
|
|
117
|
-
|
|
118
|
-
**Use RecursiveChunker for:**
|
|
119
|
-
- Code documentation with hierarchical structure
|
|
120
|
-
- Markdown documents with headers and sections
|
|
121
|
-
- Content with clear structural delimiters
|
|
122
|
-
- When you need control over splitting priorities
|
|
123
|
-
|
|
124
|
-
## Chunking workflow
|
|
125
|
-
|
|
126
|
-
<Steps>
|
|
127
|
-
<Step title="Create chunker">
|
|
128
|
-
Initialize with `.create()` and configuration options
|
|
129
|
-
|
|
130
|
-
```typescript
|
|
131
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
132
|
-
chunkSize: 512,
|
|
133
|
-
chunkOverlap: 50
|
|
134
|
-
});
|
|
135
|
-
```
|
|
136
|
-
</Step>
|
|
137
|
-
|
|
138
|
-
<Step title="Chunk text">
|
|
139
|
-
Call the chunker as a function with your text
|
|
140
|
-
|
|
141
|
-
```typescript
|
|
142
|
-
const chunks = await chunker(documentText);
|
|
143
|
-
```
|
|
144
|
-
</Step>
|
|
145
|
-
|
|
146
|
-
<Step title="Process chunks">
|
|
147
|
-
Iterate through chunks and use them in your application
|
|
148
|
-
|
|
149
|
-
```typescript
|
|
150
|
-
for (const chunk of chunks) {
|
|
151
|
-
console.log(chunk.text);
|
|
152
|
-
console.log(chunk.tokenCount);
|
|
153
|
-
console.log(chunk.startIndex, chunk.endIndex);
|
|
154
|
-
}
|
|
155
|
-
```
|
|
156
|
-
</Step>
|
|
157
|
-
|
|
158
|
-
<Step title="Generate embeddings">
|
|
159
|
-
Pass chunks to your embedder for vector generation
|
|
160
|
-
|
|
161
|
-
```typescript
|
|
162
|
-
const embeddings = await embedder.generate(
|
|
163
|
-
chunks.map(c => c.text)
|
|
164
|
-
);
|
|
165
|
-
```
|
|
166
|
-
</Step>
|
|
167
|
-
</Steps>
|
|
168
|
-
|
|
169
|
-
## Quick comparison
|
|
170
|
-
|
|
171
|
-
| Feature | SentenceChunker | RecursiveChunker |
|
|
172
|
-
|---------|----------------|------------------|
|
|
173
|
-
| **Strategy** | Sentence boundaries | Hierarchical rules |
|
|
174
|
-
| **Overlap** | ✅ Configurable | ❌ No overlap |
|
|
175
|
-
| **Best for** | Natural language | Structured documents |
|
|
176
|
-
| **Customization** | Minimal | Extensive via rules |
|
|
177
|
-
| **Complexity** | Simple | Advanced |
|
|
178
|
-
| **Level tracking** | ❌ No | ✅ Yes |
|
|
179
|
-
|
|
180
|
-
## Integration with ExuluContext
|
|
181
|
-
|
|
182
|
-
ExuluChunkers are designed to work with ExuluContext for semantic search:
|
|
183
|
-
|
|
184
|
-
```typescript
|
|
185
|
-
import { ExuluContext, ExuluChunkers, ExuluEmbedder } from "@exulu/backend";
|
|
186
|
-
|
|
187
|
-
// Create chunker
|
|
188
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
189
|
-
chunkSize: 512,
|
|
190
|
-
chunkOverlap: 50
|
|
191
|
-
});
|
|
192
|
-
|
|
193
|
-
// Create context with chunker
|
|
194
|
-
const context = new ExuluContext({
|
|
195
|
-
id: "docs",
|
|
196
|
-
name: "Documentation",
|
|
197
|
-
description: "Product documentation search",
|
|
198
|
-
embedder: embedder,
|
|
199
|
-
chunker: chunker, // Use the chunker
|
|
200
|
-
fields: [
|
|
201
|
-
{ name: "title", type: "text", required: true },
|
|
202
|
-
{ name: "content", type: "longtext", required: true }
|
|
203
|
-
],
|
|
204
|
-
sources: []
|
|
205
|
-
});
|
|
206
|
-
|
|
207
|
-
// Documents are automatically chunked during insertion
|
|
208
|
-
await context.createItem(
|
|
209
|
-
{
|
|
210
|
-
title: "Getting Started",
|
|
211
|
-
content: "Very long documentation content..."
|
|
212
|
-
},
|
|
213
|
-
{ generateEmbeddings: true }
|
|
214
|
-
);
|
|
215
|
-
```
|
|
216
|
-
|
|
217
|
-
## Chunk structure
|
|
218
|
-
|
|
219
|
-
All chunkers return an array of `Chunk` objects:
|
|
220
|
-
|
|
221
|
-
```typescript
|
|
222
|
-
type Chunk = {
|
|
223
|
-
text: string; // The chunk text
|
|
224
|
-
startIndex: number; // Start position in original text
|
|
225
|
-
endIndex: number; // End position in original text
|
|
226
|
-
tokenCount: number; // Number of tokens in chunk
|
|
227
|
-
embedding?: number[]; // Optional embedding vector
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
// RecursiveChunk extends Chunk with:
|
|
231
|
-
type RecursiveChunk = Chunk & {
|
|
232
|
-
level?: number; // Recursion level (0 = top level)
|
|
233
|
-
}
|
|
234
|
-
```
|
|
235
|
-
|
|
236
|
-
## Token counting
|
|
237
|
-
|
|
238
|
-
Both chunkers use `ExuluTokenizer` for accurate token counting:
|
|
239
|
-
|
|
240
|
-
```typescript
|
|
241
|
-
// Chunkers respect token limits, not character limits
|
|
242
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
243
|
-
chunkSize: 512 // 512 tokens, not characters
|
|
244
|
-
});
|
|
245
|
-
|
|
246
|
-
// Handles multi-byte characters correctly
|
|
247
|
-
const text = "Hello 世界! This is a test.";
|
|
248
|
-
const chunks = await chunker(text);
|
|
249
|
-
|
|
250
|
-
// Each chunk's tokenCount is accurate
|
|
251
|
-
console.log(chunks[0].tokenCount); // Actual token count
|
|
252
|
-
```
|
|
253
|
-
|
|
254
|
-
## Best practices
|
|
255
|
-
|
|
256
|
-
<Tip>
|
|
257
|
-
**Match embedding limits**: Set `chunkSize` based on your embedding model's token limit, leaving headroom below that limit (e.g., 512-token chunks for a model with a 1536-token limit).
|
|
258
|
-
</Tip>
|
|
259
|
-
|
|
260
|
-
<Note>
|
|
261
|
-
**Use overlap for continuity**: For natural language, use 10-20% overlap (e.g., 50-100 tokens for 512-token chunks) to preserve context across boundaries.
|
|
262
|
-
</Note>
|
|
263
|
-
|
|
264
|
-
<Warning>
|
|
265
|
-
**Validate chunk size**: Ensure `chunkSize` is larger than `chunkOverlap`. The chunker will throw an error if overlap equals or exceeds chunk size.
|
|
266
|
-
</Warning>
|
|
267
|
-
|
|
268
|
-
<Info>
|
|
269
|
-
**Choose the right chunker**: Use SentenceChunker for most text documents. Use RecursiveChunker when you need fine control over structural boundaries.
|
|
270
|
-
</Info>
|
|
271
|
-
|
|
272
|
-
## Performance considerations
|
|
273
|
-
|
|
274
|
-
- **Tokenization cost**: Token counting requires encoding text. For large documents, this adds processing time.
|
|
275
|
-
- **Chunk count**: Smaller chunks = more chunks = more embeddings = higher API costs and storage.
|
|
276
|
-
- **Overlap vs. accuracy**: Higher overlap improves context but increases chunk count and costs.
|
|
277
|
-
|
|
278
|
-
**Recommended chunk sizes:**
|
|
279
|
-
- Small documents (< 10K tokens): 256-512 tokens per chunk
|
|
280
|
-
- Medium documents (10K-100K tokens): 512-1024 tokens per chunk
|
|
281
|
-
- Large documents (> 100K tokens): 1024-2048 tokens per chunk
|
|
282
|
-
|
|
283
|
-
## Example: Chunking strategies
|
|
284
|
-
|
|
285
|
-
<Tabs>
|
|
286
|
-
<Tab title="Natural language">
|
|
287
|
-
```typescript
|
|
288
|
-
// Blog post, article, documentation
|
|
289
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
290
|
-
chunkSize: 512,
|
|
291
|
-
chunkOverlap: 50,
|
|
292
|
-
minSentencesPerChunk: 2, // At least 2 sentences per chunk
|
|
293
|
-
minCharactersPerSentence: 15
|
|
294
|
-
});
|
|
295
|
-
|
|
296
|
-
const text = `
|
|
297
|
-
Introduction to Machine Learning
|
|
298
|
-
|
|
299
|
-
Machine learning is a subset of AI. It enables computers
|
|
300
|
-
to learn from data without explicit programming.
|
|
301
|
-
|
|
302
|
-
Types of Learning
|
|
303
|
-
|
|
304
|
-
Supervised learning uses labeled data. Unsupervised
|
|
305
|
-
learning finds patterns in unlabeled data.
|
|
306
|
-
`;
|
|
307
|
-
|
|
308
|
-
const chunks = await chunker(text);
|
|
309
|
-
// Result: Chunks split at sentence boundaries
|
|
310
|
-
```
|
|
311
|
-
</Tab>
|
|
312
|
-
|
|
313
|
-
<Tab title="Code documentation">
|
|
314
|
-
```typescript
|
|
315
|
-
// Code documentation with clear structure
|
|
316
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
317
|
-
levels: [
|
|
318
|
-
{ delimiters: ["\n\n", "\n"] }, // Paragraphs
|
|
319
|
-
{ delimiters: ["```"] }, // Code blocks
|
|
320
|
-
{ delimiters: [". ", "! ", "? "] }, // Sentences
|
|
321
|
-
{ whitespace: true } // Words
|
|
322
|
-
]
|
|
323
|
-
});
|
|
324
|
-
|
|
325
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
326
|
-
chunkSize: 1024,
|
|
327
|
-
rules: rules,
|
|
328
|
-
minCharactersPerChunk: 50
|
|
329
|
-
});
|
|
330
|
-
|
|
331
|
-
const markdown = `
|
|
332
|
-
## Installation
|
|
333
|
-
|
|
334
|
-
Install via npm:
|
|
335
|
-
|
|
336
|
-
\`\`\`bash
|
|
337
|
-
npm install @exulu/backend
|
|
338
|
-
\`\`\`
|
|
339
|
-
|
|
340
|
-
## Usage
|
|
341
|
-
|
|
342
|
-
Import and use:
|
|
343
|
-
|
|
344
|
-
\`\`\`typescript
|
|
345
|
-
import { ExuluContext } from "@exulu/backend";
|
|
346
|
-
\`\`\`
|
|
347
|
-
`;
|
|
348
|
-
|
|
349
|
-
const chunks = await chunker(markdown);
|
|
350
|
-
// Result: Chunks respect code block and header boundaries
|
|
351
|
-
```
|
|
352
|
-
</Tab>
|
|
353
|
-
|
|
354
|
-
<Tab title="Mixed content">
|
|
355
|
-
```typescript
|
|
356
|
-
// Content with various structures
|
|
357
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
358
|
-
chunkSize: 768,
|
|
359
|
-
minCharactersPerChunk: 100
|
|
360
|
-
});
|
|
361
|
-
|
|
362
|
-
const mixedContent = `
|
|
363
|
-
# API Reference
|
|
364
|
-
|
|
365
|
-
## authenticate()
|
|
366
|
-
|
|
367
|
-
Authenticates a user with credentials.
|
|
368
|
-
|
|
369
|
-
**Parameters:**
|
|
370
|
-
- username: string
|
|
371
|
-
- password: string
|
|
372
|
-
|
|
373
|
-
**Returns:** Promise<User>
|
|
374
|
-
|
|
375
|
-
**Example:**
|
|
376
|
-
|
|
377
|
-
\`\`\`typescript
|
|
378
|
-
const user = await authenticate("john", "secret");
|
|
379
|
-
\`\`\`
|
|
380
|
-
`;
|
|
381
|
-
|
|
382
|
-
const chunks = await chunker(mixedContent);
|
|
383
|
-
// Result: Hierarchical splitting maintains structure
|
|
384
|
-
```
|
|
385
|
-
</Tab>
|
|
386
|
-
</Tabs>
|
|
387
|
-
|
|
388
|
-
## Next steps
|
|
389
|
-
|
|
390
|
-
<CardGroup cols={2}>
|
|
391
|
-
<Card title="Configuration" icon="gear" href="/core/exulu-chunkers/configuration">
|
|
392
|
-
Learn about configuration options
|
|
393
|
-
</Card>
|
|
394
|
-
<Card title="API reference" icon="code" href="/core/exulu-chunkers/api-reference">
|
|
395
|
-
Explore methods and properties
|
|
396
|
-
</Card>
|
|
397
|
-
<Card title="ExuluContext" icon="database" href="/core/exulu-context/introduction">
|
|
398
|
-
Use chunkers with contexts
|
|
399
|
-
</Card>
|
|
400
|
-
<Card title="ExuluEmbedder" icon="vector-square" href="/core/exulu-embedder/introduction">
|
|
401
|
-
Generate embeddings for chunks
|
|
402
|
-
</Card>
|
|
403
|
-
</CardGroup>
|