@exulu/backend 1.48.2 → 1.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +351 -42
- package/dist/index.d.cts +96 -1
- package/dist/index.d.ts +96 -1
- package/dist/index.js +340 -38
- package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
- package/ee/python/README.md +295 -0
- package/ee/python/documents/processing/README.md +155 -0
- package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
- package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
- package/ee/python/setup.sh +180 -0
- package/package.json +14 -3
- package/scripts/postinstall.cjs +149 -0
- package/.agents/skills/mintlify/SKILL.md +0 -347
- package/.editorconfig +0 -15
- package/.eslintrc.json +0 -52
- package/.github/workflows/release-backend.yml +0 -38
- package/.husky/commit-msg +0 -1
- package/.jscpd.json +0 -18
- package/.mcp.json +0 -25
- package/.nvmrc +0 -1
- package/.prettierignore +0 -5
- package/.prettierrc.json +0 -12
- package/CHANGELOG.md +0 -8
- package/SECURITY.md +0 -5
- package/commitlint.config.js +0 -4
- package/devops/documentation/patch-older-releases.md +0 -42
- package/ee/documents/processing/build_pdf_processor.sh +0 -35
- package/ee/documents/processing/chunk_markdown.py +0 -263
- package/ee/documents/processing/pdf_processor.spec +0 -115
- package/eslint.config.js +0 -88
- package/jest.config.ts +0 -25
- package/mintlify-docs/.mintignore +0 -7
- package/mintlify-docs/AGENTS.md +0 -33
- package/mintlify-docs/CLAUDE.MD +0 -50
- package/mintlify-docs/CONTRIBUTING.md +0 -32
- package/mintlify-docs/LICENSE +0 -21
- package/mintlify-docs/README.md +0 -55
- package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
- package/mintlify-docs/ai-tools/cursor.mdx +0 -39
- package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
- package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
- package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
- package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
- package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
- package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
- package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
- package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
- package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
- package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
- package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
- package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
- package/mintlify-docs/api-reference/core-types.mdx +0 -585
- package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
- package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
- package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
- package/mintlify-docs/api-reference/introduction.mdx +0 -661
- package/mintlify-docs/api-reference/mutations.mdx +0 -1012
- package/mintlify-docs/api-reference/openapi.json +0 -217
- package/mintlify-docs/api-reference/queries.mdx +0 -1154
- package/mintlify-docs/backend/introduction.mdx +0 -218
- package/mintlify-docs/changelog.mdx +0 -387
- package/mintlify-docs/community-edition.mdx +0 -304
- package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
- package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
- package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
- package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
- package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
- package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
- package/mintlify-docs/core/exulu-authentication.mdx +0 -810
- package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
- package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
- package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
- package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
- package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
- package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
- package/mintlify-docs/core/exulu-database.mdx +0 -811
- package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
- package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
- package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
- package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
- package/mintlify-docs/core/exulu-logging.mdx +0 -464
- package/mintlify-docs/core/exulu-otel.mdx +0 -670
- package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
- package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
- package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
- package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
- package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
- package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
- package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
- package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
- package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
- package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
- package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
- package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
- package/mintlify-docs/development.mdx +0 -94
- package/mintlify-docs/docs.json +0 -248
- package/mintlify-docs/enterprise-edition.mdx +0 -538
- package/mintlify-docs/essentials/code.mdx +0 -35
- package/mintlify-docs/essentials/images.mdx +0 -59
- package/mintlify-docs/essentials/markdown.mdx +0 -88
- package/mintlify-docs/essentials/navigation.mdx +0 -87
- package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
- package/mintlify-docs/essentials/settings.mdx +0 -318
- package/mintlify-docs/favicon.svg +0 -3
- package/mintlify-docs/frontend/introduction.mdx +0 -39
- package/mintlify-docs/getting-started.mdx +0 -267
- package/mintlify-docs/guides/custom-agent.mdx +0 -608
- package/mintlify-docs/guides/first-agent.mdx +0 -315
- package/mintlify-docs/images/admin_ui.png +0 -0
- package/mintlify-docs/images/contexts.png +0 -0
- package/mintlify-docs/images/create_agents.png +0 -0
- package/mintlify-docs/images/evals.png +0 -0
- package/mintlify-docs/images/graphql.png +0 -0
- package/mintlify-docs/images/graphql_api.png +0 -0
- package/mintlify-docs/images/hero-dark.png +0 -0
- package/mintlify-docs/images/hero-light.png +0 -0
- package/mintlify-docs/images/hero.png +0 -0
- package/mintlify-docs/images/knowledge_sources.png +0 -0
- package/mintlify-docs/images/mcp.png +0 -0
- package/mintlify-docs/images/scaling.png +0 -0
- package/mintlify-docs/index.mdx +0 -411
- package/mintlify-docs/logo/dark.svg +0 -9
- package/mintlify-docs/logo/light.svg +0 -9
- package/mintlify-docs/partners.mdx +0 -558
- package/mintlify-docs/products.mdx +0 -77
- package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
- package/mintlify-docs/styles.css +0 -207
- package/ngrok.bash +0 -1
- package/ngrok.md +0 -6
- package/ngrok.yml +0 -10
- package/release.config.cjs +0 -15
- package/skills-lock.json +0 -10
- package/types/context-processor.ts +0 -45
- package/types/enums/eval-types.ts +0 -5
- package/types/enums/field-types.ts +0 -1
- package/types/enums/jobs.ts +0 -11
- package/types/enums/statistics.ts +0 -13
- package/types/exulu-table-definition.ts +0 -79
- package/types/file-types.ts +0 -18
- package/types/models/agent-session.ts +0 -27
- package/types/models/agent.ts +0 -68
- package/types/models/context.ts +0 -53
- package/types/models/embedding.ts +0 -17
- package/types/models/eval-run.ts +0 -40
- package/types/models/exulu-agent-tool-config.ts +0 -11
- package/types/models/item.ts +0 -21
- package/types/models/job.ts +0 -8
- package/types/models/project.ts +0 -16
- package/types/models/rate-limiter-rules.ts +0 -7
- package/types/models/test-case.ts +0 -25
- package/types/models/tool.ts +0 -9
- package/types/models/user-role.ts +0 -12
- package/types/models/user.ts +0 -20
- package/types/models/variable.ts +0 -8
- package/types/models/vector-methods.ts +0 -7
- package/types/provider-config.ts +0 -21
- package/types/queue-config.ts +0 -16
- package/types/rbac-rights-modes.ts +0 -1
- package/types/statistics.ts +0 -20
- package/types/workflow.ts +0 -31
- /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
- /package/ee/{documents/processing → python}/requirements.txt +0 -0
|
@@ -1,1011 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
title: "API reference"
|
|
3
|
-
description: "Complete method and property reference for ExuluChunkers"
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
## ExuluChunkers namespace
|
|
7
|
-
|
|
8
|
-
ExuluChunkers is exported as a namespace object:
|
|
9
|
-
|
|
10
|
-
```typescript
|
|
11
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
12
|
-
|
|
13
|
-
// Access sentence chunker
|
|
14
|
-
const sentenceChunker = await ExuluChunkers.sentence.create({...});
|
|
15
|
-
|
|
16
|
-
// Access recursive chunker
|
|
17
|
-
const recursiveChunker = await ExuluChunkers.recursive.function.create({...});
|
|
18
|
-
|
|
19
|
-
// Access recursive rules
|
|
20
|
-
const rules = new ExuluChunkers.recursive.rules({...});
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
## SentenceChunker
|
|
24
|
-
|
|
25
|
-
### create()
|
|
26
|
-
|
|
27
|
-
Factory method to create a new SentenceChunker instance.
|
|
28
|
-
|
|
29
|
-
```typescript
|
|
30
|
-
static async create(options: SentenceChunkerOptions): Promise<CallableSentenceChunker>
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
<ParamField path="options" type="SentenceChunkerOptions" required>
|
|
34
|
-
Configuration options for the chunker
|
|
35
|
-
</ParamField>
|
|
36
|
-
|
|
37
|
-
<ParamField path="options.chunkSize" type="number" required>
|
|
38
|
-
Maximum number of tokens per chunk
|
|
39
|
-
</ParamField>
|
|
40
|
-
|
|
41
|
-
<ParamField path="options.chunkOverlap" type="number" default={0}>
|
|
42
|
-
Number of tokens to overlap between chunks (default: 0)
|
|
43
|
-
</ParamField>
|
|
44
|
-
|
|
45
|
-
<ParamField path="options.minSentencesPerChunk" type="number" default={1}>
|
|
46
|
-
Minimum sentences per chunk (default: 1)
|
|
47
|
-
</ParamField>
|
|
48
|
-
|
|
49
|
-
<ParamField path="options.minCharactersPerSentence" type="number" default={10}>
|
|
50
|
-
Minimum character length for a sentence (default: 10)
|
|
51
|
-
</ParamField>
|
|
52
|
-
|
|
53
|
-
<ResponseField name="return" type="Promise<CallableSentenceChunker>">
|
|
54
|
-
A callable chunker function that can be invoked with text
|
|
55
|
-
</ResponseField>
|
|
56
|
-
|
|
57
|
-
```typescript
|
|
58
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
59
|
-
|
|
60
|
-
// Create chunker
|
|
61
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
62
|
-
chunkSize: 512,
|
|
63
|
-
chunkOverlap: 50,
|
|
64
|
-
minSentencesPerChunk: 2,
|
|
65
|
-
minCharactersPerSentence: 15
|
|
66
|
-
});
|
|
67
|
-
|
|
68
|
-
// Use chunker
|
|
69
|
-
const text = "Your document text here...";
|
|
70
|
-
const chunks = await chunker(text);
|
|
71
|
-
|
|
72
|
-
console.log(chunks.length); // Number of chunks
|
|
73
|
-
console.log(chunks[0].text); // First chunk text
|
|
74
|
-
console.log(chunks[0].tokenCount); // Token count
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
### CallableSentenceChunker
|
|
78
|
-
|
|
79
|
-
The chunker returned by `create()` is a callable function:
|
|
80
|
-
|
|
81
|
-
```typescript
|
|
82
|
-
async (text: string): Promise<Chunk[]>
|
|
83
|
-
```
|
|
84
|
-
|
|
85
|
-
<ParamField path="text" type="string" required>
|
|
86
|
-
The text to chunk
|
|
87
|
-
</ParamField>
|
|
88
|
-
|
|
89
|
-
<ResponseField name="return" type="Promise<Chunk[]>">
|
|
90
|
-
Array of Chunk objects
|
|
91
|
-
</ResponseField>
|
|
92
|
-
|
|
93
|
-
```typescript
|
|
94
|
-
const chunks = await chunker("Long text to chunk...");
|
|
95
|
-
|
|
96
|
-
for (const chunk of chunks) {
|
|
97
|
-
console.log(chunk.text);
|
|
98
|
-
console.log(chunk.tokenCount);
|
|
99
|
-
console.log(chunk.startIndex, chunk.endIndex);
|
|
100
|
-
}
|
|
101
|
-
```
|
|
102
|
-
|
|
103
|
-
### Properties
|
|
104
|
-
|
|
105
|
-
The callable chunker also has properties from the SentenceChunker class:
|
|
106
|
-
|
|
107
|
-
<ResponseField name="chunkSize" type="number">
|
|
108
|
-
Maximum tokens per chunk
|
|
109
|
-
</ResponseField>
|
|
110
|
-
|
|
111
|
-
<ResponseField name="chunkOverlap" type="number">
|
|
112
|
-
Overlap in tokens
|
|
113
|
-
</ResponseField>
|
|
114
|
-
|
|
115
|
-
<ResponseField name="minSentencesPerChunk" type="number">
|
|
116
|
-
Minimum sentences per chunk
|
|
117
|
-
</ResponseField>
|
|
118
|
-
|
|
119
|
-
<ResponseField name="minCharactersPerSentence" type="number">
|
|
120
|
-
Minimum characters per sentence
|
|
121
|
-
</ResponseField>
|
|
122
|
-
|
|
123
|
-
<ResponseField name="tokenizer" type="ExuluTokenizer">
|
|
124
|
-
The tokenizer instance used for counting tokens
|
|
125
|
-
</ResponseField>
|
|
126
|
-
|
|
127
|
-
```typescript
|
|
128
|
-
console.log(chunker.chunkSize); // 512
|
|
129
|
-
console.log(chunker.chunkOverlap); // 50
|
|
130
|
-
console.log(chunker.minSentencesPerChunk); // 2
|
|
131
|
-
```
|
|
132
|
-
|
|
133
|
-
## RecursiveChunker
|
|
134
|
-
|
|
135
|
-
### create()
|
|
136
|
-
|
|
137
|
-
Factory method to create a new RecursiveChunker instance.
|
|
138
|
-
|
|
139
|
-
```typescript
|
|
140
|
-
static async create(options: RecursiveChunkerOptions): Promise<CallableRecursiveChunker>
|
|
141
|
-
```
|
|
142
|
-
|
|
143
|
-
<ParamField path="options" type="RecursiveChunkerOptions" required>
|
|
144
|
-
Configuration options for the chunker
|
|
145
|
-
</ParamField>
|
|
146
|
-
|
|
147
|
-
<ParamField path="options.chunkSize" type="number" required>
|
|
148
|
-
Maximum number of tokens per chunk
|
|
149
|
-
</ParamField>
|
|
150
|
-
|
|
151
|
-
<ParamField path="options.rules" type="RecursiveRules" default="default rules">
|
|
152
|
-
Recursive splitting rules (default: paragraphs → sentences → pauses → words → tokens)
|
|
153
|
-
</ParamField>
|
|
154
|
-
|
|
155
|
-
<ParamField path="options.minCharactersPerChunk" type="number" default={50}>
|
|
156
|
-
Minimum character length for a chunk (default: 50)
|
|
157
|
-
</ParamField>
|
|
158
|
-
|
|
159
|
-
<ResponseField name="return" type="Promise<CallableRecursiveChunker>">
|
|
160
|
-
A callable chunker function that can be invoked with text
|
|
161
|
-
</ResponseField>
|
|
162
|
-
|
|
163
|
-
```typescript
|
|
164
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
165
|
-
|
|
166
|
-
// Create with default rules
|
|
167
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
168
|
-
chunkSize: 1024,
|
|
169
|
-
minCharactersPerChunk: 75
|
|
170
|
-
});
|
|
171
|
-
|
|
172
|
-
// Or with custom rules
|
|
173
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
174
|
-
levels: [
|
|
175
|
-
{ delimiters: ["\n\n"] },
|
|
176
|
-
{ delimiters: [". "] },
|
|
177
|
-
{ whitespace: true }
|
|
178
|
-
]
|
|
179
|
-
});
|
|
180
|
-
|
|
181
|
-
const customChunker = await ExuluChunkers.recursive.function.create({
|
|
182
|
-
chunkSize: 1024,
|
|
183
|
-
rules: rules,
|
|
184
|
-
minCharactersPerChunk: 50
|
|
185
|
-
});
|
|
186
|
-
```
|
|
187
|
-
|
|
188
|
-
### CallableRecursiveChunker
|
|
189
|
-
|
|
190
|
-
The chunker returned by `create()` is a callable function:
|
|
191
|
-
|
|
192
|
-
```typescript
|
|
193
|
-
async (text: string): Promise<RecursiveChunk[]>
|
|
194
|
-
```
|
|
195
|
-
|
|
196
|
-
<ParamField path="text" type="string" required>
|
|
197
|
-
The text to chunk
|
|
198
|
-
</ParamField>
|
|
199
|
-
|
|
200
|
-
<ResponseField name="return" type="Promise<RecursiveChunk[]>">
|
|
201
|
-
Array of RecursiveChunk objects
|
|
202
|
-
</ResponseField>
|
|
203
|
-
|
|
204
|
-
```typescript
|
|
205
|
-
const chunks = await chunker("Long text to chunk...");
|
|
206
|
-
|
|
207
|
-
for (const chunk of chunks) {
|
|
208
|
-
console.log(`Level ${chunk.level}: ${chunk.text}`);
|
|
209
|
-
console.log(`Tokens: ${chunk.tokenCount}`);
|
|
210
|
-
console.log(`Range: ${chunk.startIndex}-${chunk.endIndex}`);
|
|
211
|
-
}
|
|
212
|
-
```
|
|
213
|
-
|
|
214
|
-
### Properties
|
|
215
|
-
|
|
216
|
-
The callable chunker also has properties from the RecursiveChunker class:
|
|
217
|
-
|
|
218
|
-
<ResponseField name="chunkSize" type="number">
|
|
219
|
-
Maximum tokens per chunk
|
|
220
|
-
</ResponseField>
|
|
221
|
-
|
|
222
|
-
<ResponseField name="rules" type="RecursiveRules">
|
|
223
|
-
The recursive splitting rules
|
|
224
|
-
</ResponseField>
|
|
225
|
-
|
|
226
|
-
<ResponseField name="minCharactersPerChunk" type="number">
|
|
227
|
-
Minimum characters per chunk
|
|
228
|
-
</ResponseField>
|
|
229
|
-
|
|
230
|
-
<ResponseField name="tokenizer" type="ExuluTokenizer">
|
|
231
|
-
The tokenizer instance used for counting tokens
|
|
232
|
-
</ResponseField>
|
|
233
|
-
|
|
234
|
-
```typescript
|
|
235
|
-
console.log(chunker.chunkSize); // 1024
|
|
236
|
-
console.log(chunker.minCharactersPerChunk); // 75
|
|
237
|
-
console.log(chunker.rules.length); // Number of levels
|
|
238
|
-
```
|
|
239
|
-
|
|
240
|
-
## RecursiveRules
|
|
241
|
-
|
|
242
|
-
Class representing recursive chunking rules.
|
|
243
|
-
|
|
244
|
-
### Constructor
|
|
245
|
-
|
|
246
|
-
```typescript
|
|
247
|
-
new RecursiveRules(data?: RecursiveRulesData)
|
|
248
|
-
```
|
|
249
|
-
|
|
250
|
-
<ParamField path="data" type="RecursiveRulesData">
|
|
251
|
-
Configuration for recursive rules
|
|
252
|
-
</ParamField>
|
|
253
|
-
|
|
254
|
-
<ParamField path="data.levels" type="RecursiveLevelData[]">
|
|
255
|
-
Array of recursive levels defining the splitting hierarchy
|
|
256
|
-
</ParamField>
|
|
257
|
-
|
|
258
|
-
```typescript
|
|
259
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
260
|
-
|
|
261
|
-
// Create with default levels
|
|
262
|
-
const defaultRules = new ExuluChunkers.recursive.rules();
|
|
263
|
-
|
|
264
|
-
// Create with custom levels
|
|
265
|
-
const customRules = new ExuluChunkers.recursive.rules({
|
|
266
|
-
levels: [
|
|
267
|
-
{ delimiters: ["\n\n", "\n"] },
|
|
268
|
-
{ delimiters: [". ", "! ", "? "] },
|
|
269
|
-
{ whitespace: true }
|
|
270
|
-
]
|
|
271
|
-
});
|
|
272
|
-
```
|
|
273
|
-
|
|
274
|
-
**Default levels:**
|
|
275
|
-
1. Paragraphs: `["\n\n", "\r\n", "\n", "\r"]`
|
|
276
|
-
2. Sentences: `[". ", "! ", "? "]`
|
|
277
|
-
3. Pauses: ``["{", "}", '"', "[", "]", "<", ">", "(", ")", ":", ";", ",", "—", "|", "~", "-", "...", "`", "'"]``
|
|
278
|
-
4. Words: `whitespace: true`
|
|
279
|
-
5. Tokens: No delimiters
|
|
280
|
-
|
|
281
|
-
### Properties
|
|
282
|
-
|
|
283
|
-
<ResponseField name="levels" type="RecursiveLevel[]">
|
|
284
|
-
Array of recursive levels
|
|
285
|
-
</ResponseField>
|
|
286
|
-
|
|
287
|
-
<ResponseField name="length" type="number" getter>
|
|
288
|
-
Number of levels in the rules
|
|
289
|
-
</ResponseField>
|
|
290
|
-
|
|
291
|
-
```typescript
|
|
292
|
-
const rules = new ExuluChunkers.recursive.rules();
|
|
293
|
-
|
|
294
|
-
console.log(rules.length); // 5 (default levels)
|
|
295
|
-
console.log(rules.levels[0]); // First level (paragraphs)
|
|
296
|
-
```
|
|
297
|
-
|
|
298
|
-
### Methods
|
|
299
|
-
|
|
300
|
-
#### getLevel()
|
|
301
|
-
|
|
302
|
-
Get a level by index.
|
|
303
|
-
|
|
304
|
-
```typescript
|
|
305
|
-
getLevel(index: number): RecursiveLevel | undefined
|
|
306
|
-
```
|
|
307
|
-
|
|
308
|
-
<ParamField path="index" type="number" required>
|
|
309
|
-
The index of the level to retrieve
|
|
310
|
-
</ParamField>
|
|
311
|
-
|
|
312
|
-
<ResponseField name="return" type="RecursiveLevel | undefined">
|
|
313
|
-
The level at the specified index, or undefined if not found
|
|
314
|
-
</ResponseField>
|
|
315
|
-
|
|
316
|
-
```typescript
|
|
317
|
-
const rules = new ExuluChunkers.recursive.rules();
|
|
318
|
-
|
|
319
|
-
const firstLevel = rules.getLevel(0); // Paragraphs level
|
|
320
|
-
const secondLevel = rules.getLevel(1); // Sentences level
|
|
321
|
-
const invalid = rules.getLevel(999); // undefined
|
|
322
|
-
```
|
|
323
|
-
|
|
324
|
-
#### toDict()
|
|
325
|
-
|
|
326
|
-
Convert rules to a dictionary-like object.
|
|
327
|
-
|
|
328
|
-
```typescript
|
|
329
|
-
toDict(): RecursiveRulesData
|
|
330
|
-
```
|
|
331
|
-
|
|
332
|
-
<ResponseField name="return" type="RecursiveRulesData">
|
|
333
|
-
Dictionary representation of the rules
|
|
334
|
-
</ResponseField>
|
|
335
|
-
|
|
336
|
-
```typescript
|
|
337
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
338
|
-
levels: [
|
|
339
|
-
{ delimiters: ["\n\n"] },
|
|
340
|
-
{ whitespace: true }
|
|
341
|
-
]
|
|
342
|
-
});
|
|
343
|
-
|
|
344
|
-
const dict = rules.toDict();
|
|
345
|
-
console.log(dict);
|
|
346
|
-
// { levels: [{ delimiters: ["\n\n"], whitespace: false, includeDelim: "prev" }, ...] }
|
|
347
|
-
```
|
|
348
|
-
|
|
349
|
-
#### fromDict()
|
|
350
|
-
|
|
351
|
-
Create RecursiveRules from a dictionary.
|
|
352
|
-
|
|
353
|
-
```typescript
|
|
354
|
-
static fromDict(data: RecursiveRulesData): RecursiveRules
|
|
355
|
-
```
|
|
356
|
-
|
|
357
|
-
<ParamField path="data" type="RecursiveRulesData" required>
|
|
358
|
-
Dictionary representation of rules
|
|
359
|
-
</ParamField>
|
|
360
|
-
|
|
361
|
-
<ResponseField name="return" type="RecursiveRules">
|
|
362
|
-
New RecursiveRules instance
|
|
363
|
-
</ResponseField>
|
|
364
|
-
|
|
365
|
-
```typescript
|
|
366
|
-
const data = {
|
|
367
|
-
levels: [
|
|
368
|
-
{ delimiters: ["\n\n"] },
|
|
369
|
-
{ whitespace: true }
|
|
370
|
-
]
|
|
371
|
-
};
|
|
372
|
-
|
|
373
|
-
const rules = ExuluChunkers.recursive.rules.fromDict(data);
|
|
374
|
-
```
|
|
375
|
-
|
|
376
|
-
#### toString()
|
|
377
|
-
|
|
378
|
-
String representation of the rules.
|
|
379
|
-
|
|
380
|
-
```typescript
|
|
381
|
-
toString(): string
|
|
382
|
-
```
|
|
383
|
-
|
|
384
|
-
<ResponseField name="return" type="string">
|
|
385
|
-
String representation
|
|
386
|
-
</ResponseField>
|
|
387
|
-
|
|
388
|
-
```typescript
|
|
389
|
-
const rules = new ExuluChunkers.recursive.rules();
|
|
390
|
-
console.log(rules.toString());
|
|
391
|
-
// "RecursiveRules(levels=[...])"
|
|
392
|
-
```
|
|
393
|
-
|
|
394
|
-
#### Symbol.iterator
|
|
395
|
-
|
|
396
|
-
The rules object is iterable:
|
|
397
|
-
|
|
398
|
-
```typescript
|
|
399
|
-
for (const level of rules) {
|
|
400
|
-
console.log(level.delimiters);
|
|
401
|
-
console.log(level.whitespace);
|
|
402
|
-
}
|
|
403
|
-
```
|
|
404
|
-
|
|
405
|
-
## RecursiveLevel
|
|
406
|
-
|
|
407
|
-
Class representing a single level in the recursive hierarchy.
|
|
408
|
-
|
|
409
|
-
### Constructor
|
|
410
|
-
|
|
411
|
-
```typescript
|
|
412
|
-
new RecursiveLevel(data?: RecursiveLevelData)
|
|
413
|
-
```
|
|
414
|
-
|
|
415
|
-
<ParamField path="data" type="RecursiveLevelData">
|
|
416
|
-
Configuration for the level
|
|
417
|
-
</ParamField>
|
|
418
|
-
|
|
419
|
-
<ParamField path="data.delimiters" type="string | string[]">
|
|
420
|
-
Delimiter(s) to use for splitting at this level
|
|
421
|
-
</ParamField>
|
|
422
|
-
|
|
423
|
-
<ParamField path="data.whitespace" type="boolean" default={false}>
|
|
424
|
-
Whether to split on whitespace (default: false)
|
|
425
|
-
</ParamField>
|
|
426
|
-
|
|
427
|
-
<ParamField path="data.includeDelim" type="'prev' | 'next'" default="prev">
|
|
428
|
-
Whether to include delimiter in previous or next chunk (default: "prev")
|
|
429
|
-
</ParamField>
|
|
430
|
-
|
|
431
|
-
```typescript
|
|
432
|
-
// Single delimiter
|
|
433
|
-
const level1 = new RecursiveLevel({
|
|
434
|
-
delimiters: "\n\n"
|
|
435
|
-
});
|
|
436
|
-
|
|
437
|
-
// Multiple delimiters
|
|
438
|
-
const level2 = new RecursiveLevel({
|
|
439
|
-
delimiters: [". ", "! ", "? "],
|
|
440
|
-
includeDelim: "prev"
|
|
441
|
-
});
|
|
442
|
-
|
|
443
|
-
// Whitespace splitting
|
|
444
|
-
const level3 = new RecursiveLevel({
|
|
445
|
-
whitespace: true
|
|
446
|
-
});
|
|
447
|
-
|
|
448
|
-
// No delimiters (token-level fallback)
|
|
449
|
-
const level4 = new RecursiveLevel();
|
|
450
|
-
```
|
|
451
|
-
|
|
452
|
-
<Warning>
|
|
453
|
-
Cannot use both `delimiters` and `whitespace` in the same level. They are mutually exclusive.
|
|
454
|
-
</Warning>
|
|
455
|
-
|
|
456
|
-
### Properties
|
|
457
|
-
|
|
458
|
-
<ResponseField name="delimiters" type="string | string[] | undefined">
|
|
459
|
-
Custom delimiters for chunking
|
|
460
|
-
</ResponseField>
|
|
461
|
-
|
|
462
|
-
<ResponseField name="whitespace" type="boolean">
|
|
463
|
-
Whether to use whitespace as a delimiter
|
|
464
|
-
</ResponseField>
|
|
465
|
-
|
|
466
|
-
<ResponseField name="includeDelim" type="'prev' | 'next'">
|
|
467
|
-
Where to include the delimiter
|
|
468
|
-
</ResponseField>
|
|
469
|
-
|
|
470
|
-
```typescript
|
|
471
|
-
const level = new RecursiveLevel({
|
|
472
|
-
delimiters: [". ", "! ", "? "],
|
|
473
|
-
includeDelim: "prev"
|
|
474
|
-
});
|
|
475
|
-
|
|
476
|
-
console.log(level.delimiters); // [". ", "! ", "? "]
|
|
477
|
-
console.log(level.whitespace); // false
|
|
478
|
-
console.log(level.includeDelim); // "prev"
|
|
479
|
-
```
|
|
480
|
-
|
|
481
|
-
### Methods
|
|
482
|
-
|
|
483
|
-
#### toDict()
|
|
484
|
-
|
|
485
|
-
Convert level to dictionary.
|
|
486
|
-
|
|
487
|
-
```typescript
|
|
488
|
-
toDict(): RecursiveLevelData
|
|
489
|
-
```
|
|
490
|
-
|
|
491
|
-
<ResponseField name="return" type="RecursiveLevelData">
|
|
492
|
-
Dictionary representation
|
|
493
|
-
</ResponseField>
|
|
494
|
-
|
|
495
|
-
```typescript
|
|
496
|
-
const level = new RecursiveLevel({ delimiters: [". "] });
|
|
497
|
-
const dict = level.toDict();
|
|
498
|
-
console.log(dict);
|
|
499
|
-
// { delimiters: [". "], whitespace: false, includeDelim: "prev" }
|
|
500
|
-
```
|
|
501
|
-
|
|
502
|
-
#### fromDict()
|
|
503
|
-
|
|
504
|
-
Create RecursiveLevel from dictionary.
|
|
505
|
-
|
|
506
|
-
```typescript
|
|
507
|
-
static fromDict(data: RecursiveLevelData): RecursiveLevel
|
|
508
|
-
```
|
|
509
|
-
|
|
510
|
-
<ParamField path="data" type="RecursiveLevelData" required>
|
|
511
|
-
Dictionary representation
|
|
512
|
-
</ParamField>
|
|
513
|
-
|
|
514
|
-
<ResponseField name="return" type="RecursiveLevel">
|
|
515
|
-
New RecursiveLevel instance
|
|
516
|
-
</ResponseField>
|
|
517
|
-
|
|
518
|
-
```typescript
|
|
519
|
-
const data = { delimiters: [". "], includeDelim: "next" };
|
|
520
|
-
const level = RecursiveLevel.fromDict(data);
|
|
521
|
-
```
|
|
522
|
-
|
|
523
|
-
#### toString()
|
|
524
|
-
|
|
525
|
-
String representation of the level.
|
|
526
|
-
|
|
527
|
-
```typescript
|
|
528
|
-
toString(): string
|
|
529
|
-
```
|
|
530
|
-
|
|
531
|
-
<ResponseField name="return" type="string">
|
|
532
|
-
String representation
|
|
533
|
-
</ResponseField>
|
|
534
|
-
|
|
535
|
-
```typescript
|
|
536
|
-
const level = new RecursiveLevel({ delimiters: [". "] });
|
|
537
|
-
console.log(level.toString());
|
|
538
|
-
// "RecursiveLevel(delimiters=[". "], whitespace=false, includeDelim=prev)"
|
|
539
|
-
```
|
|
540
|
-
|
|
541
|
-
## Chunk
|
|
542
|
-
|
|
543
|
-
Base class for text chunks.
|
|
544
|
-
|
|
545
|
-
### Properties
|
|
546
|
-
|
|
547
|
-
<ResponseField name="text" type="string">
|
|
548
|
-
The chunk text
|
|
549
|
-
</ResponseField>
|
|
550
|
-
|
|
551
|
-
<ResponseField name="startIndex" type="number">
|
|
552
|
-
Starting index in the original text
|
|
553
|
-
</ResponseField>
|
|
554
|
-
|
|
555
|
-
<ResponseField name="endIndex" type="number">
|
|
556
|
-
Ending index in the original text
|
|
557
|
-
</ResponseField>
|
|
558
|
-
|
|
559
|
-
<ResponseField name="tokenCount" type="number">
|
|
560
|
-
Number of tokens in the chunk
|
|
561
|
-
</ResponseField>
|
|
562
|
-
|
|
563
|
-
<ResponseField name="embedding" type="number[] | undefined">
|
|
564
|
-
Optional embedding vector for the chunk
|
|
565
|
-
</ResponseField>
|
|
566
|
-
|
|
567
|
-
```typescript
|
|
568
|
-
const chunk = chunks[0];
|
|
569
|
-
|
|
570
|
-
console.log(chunk.text); // "This is the first chunk..."
|
|
571
|
-
console.log(chunk.startIndex); // 0
|
|
572
|
-
console.log(chunk.endIndex); // 245
|
|
573
|
-
console.log(chunk.tokenCount); // 48
|
|
574
|
-
console.log(chunk.embedding); // undefined (or embedding array)
|
|
575
|
-
```
|
|
576
|
-
|
|
577
|
-
### Methods
|
|
578
|
-
|
|
579
|
-
#### toString()
|
|
580
|
-
|
|
581
|
-
String representation of the chunk (returns the text).
|
|
582
|
-
|
|
583
|
-
```typescript
|
|
584
|
-
toString(): string
|
|
585
|
-
```
|
|
586
|
-
|
|
587
|
-
<ResponseField name="return" type="string">
|
|
588
|
-
The chunk text
|
|
589
|
-
</ResponseField>
|
|
590
|
-
|
|
591
|
-
```typescript
|
|
592
|
-
console.log(chunk.toString()); // "This is the first chunk..."
|
|
593
|
-
```
|
|
594
|
-
|
|
595
|
-
#### toRepresentation()
|
|
596
|
-
|
|
597
|
-
Detailed string representation.
|
|
598
|
-
|
|
599
|
-
```typescript
|
|
600
|
-
toRepresentation(): string
|
|
601
|
-
```
|
|
602
|
-
|
|
603
|
-
<ResponseField name="return" type="string">
|
|
604
|
-
Detailed representation
|
|
605
|
-
</ResponseField>
|
|
606
|
-
|
|
607
|
-
```typescript
|
|
608
|
-
console.log(chunk.toRepresentation());
|
|
609
|
-
// "Chunk(text='...', tokenCount=48, startIndex=0, endIndex=245)"
|
|
610
|
-
```
|
|
611
|
-
|
|
612
|
-
#### slice()
|
|
613
|
-
|
|
614
|
-
Get a slice of the chunk's text.
|
|
615
|
-
|
|
616
|
-
```typescript
|
|
617
|
-
slice(start?: number, end?: number): string
|
|
618
|
-
```
|
|
619
|
-
|
|
620
|
-
<ParamField path="start" type="number">
|
|
621
|
-
Starting index for the slice
|
|
622
|
-
</ParamField>
|
|
623
|
-
|
|
624
|
-
<ParamField path="end" type="number">
|
|
625
|
-
Ending index for the slice
|
|
626
|
-
</ParamField>
|
|
627
|
-
|
|
628
|
-
<ResponseField name="return" type="string">
|
|
629
|
-
Sliced text
|
|
630
|
-
</ResponseField>
|
|
631
|
-
|
|
632
|
-
```typescript
|
|
633
|
-
const chunk = chunks[0];
|
|
634
|
-
console.log(chunk.slice(0, 50)); // First 50 characters
|
|
635
|
-
```
|
|
636
|
-
|
|
637
|
-
#### toDict()
|
|
638
|
-
|
|
639
|
-
Convert chunk to dictionary.
|
|
640
|
-
|
|
641
|
-
```typescript
|
|
642
|
-
toDict(): ChunkData
|
|
643
|
-
```
|
|
644
|
-
|
|
645
|
-
<ResponseField name="return" type="ChunkData">
|
|
646
|
-
Dictionary representation
|
|
647
|
-
</ResponseField>
|
|
648
|
-
|
|
649
|
-
```typescript
|
|
650
|
-
const dict = chunk.toDict();
|
|
651
|
-
console.log(dict);
|
|
652
|
-
// { text: "...", startIndex: 0, endIndex: 245, tokenCount: 48, embedding: undefined }
|
|
653
|
-
```
|
|
654
|
-
|
|
655
|
-
#### fromDict()
|
|
656
|
-
|
|
657
|
-
Create Chunk from dictionary.
|
|
658
|
-
|
|
659
|
-
```typescript
|
|
660
|
-
static fromDict(data: ChunkData): Chunk
|
|
661
|
-
```
|
|
662
|
-
|
|
663
|
-
<ParamField path="data" type="ChunkData" required>
|
|
664
|
-
Dictionary representation
|
|
665
|
-
</ParamField>
|
|
666
|
-
|
|
667
|
-
<ResponseField name="return" type="Chunk">
|
|
668
|
-
New Chunk instance
|
|
669
|
-
</ResponseField>
|
|
670
|
-
|
|
671
|
-
```typescript
|
|
672
|
-
const data = {
|
|
673
|
-
text: "Sample text",
|
|
674
|
-
startIndex: 0,
|
|
675
|
-
endIndex: 11,
|
|
676
|
-
tokenCount: 3
|
|
677
|
-
};
|
|
678
|
-
|
|
679
|
-
const chunk = Chunk.fromDict(data);
|
|
680
|
-
```
|
|
681
|
-
|
|
682
|
-
#### copy()
|
|
683
|
-
|
|
684
|
-
Create a deep copy of the chunk.
|
|
685
|
-
|
|
686
|
-
```typescript
|
|
687
|
-
copy(): Chunk
|
|
688
|
-
```
|
|
689
|
-
|
|
690
|
-
<ResponseField name="return" type="Chunk">
|
|
691
|
-
Deep copy of the chunk
|
|
692
|
-
</ResponseField>
|
|
693
|
-
|
|
694
|
-
```typescript
|
|
695
|
-
const original = chunks[0];
|
|
696
|
-
const copy = original.copy();
|
|
697
|
-
|
|
698
|
-
console.log(copy.text === original.text); // true
|
|
699
|
-
console.log(copy === original); // false (different objects)
|
|
700
|
-
```
|
|
701
|
-
|
|
702
|
-
## RecursiveChunk
|
|
703
|
-
|
|
704
|
-
Extends `Chunk` with recursion level tracking.
|
|
705
|
-
|
|
706
|
-
### Properties
|
|
707
|
-
|
|
708
|
-
All properties from `Chunk`, plus:
|
|
709
|
-
|
|
710
|
-
<ResponseField name="level" type="number | undefined">
|
|
711
|
-
The recursion level at which this chunk was created
|
|
712
|
-
</ResponseField>
|
|
713
|
-
|
|
714
|
-
```typescript
|
|
715
|
-
const chunk = chunks[0];
|
|
716
|
-
|
|
717
|
-
console.log(chunk.text); // "This is the first chunk..."
|
|
718
|
-
console.log(chunk.tokenCount); // 48
|
|
719
|
-
console.log(chunk.level); // 0 (split at top level)
|
|
720
|
-
```
|
|
721
|
-
|
|
722
|
-
**Level interpretation:**
|
|
723
|
-
- `0`: Split at first level (e.g., paragraphs)
|
|
724
|
-
- `1`: Split at second level (e.g., sentences)
|
|
725
|
-
- `2`: Split at third level (e.g., pauses)
|
|
726
|
-
- etc.
|
|
727
|
-
|
|
728
|
-
### Methods
|
|
729
|
-
|
|
730
|
-
All methods from `Chunk`, with overridden implementations that preserve the `level` property.
|
|
731
|
-
|
|
732
|
-
## Usage examples
|
|
733
|
-
|
|
734
|
-
### Basic sentence chunking
|
|
735
|
-
|
|
736
|
-
```typescript
|
|
737
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
738
|
-
|
|
739
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
740
|
-
chunkSize: 512,
|
|
741
|
-
chunkOverlap: 50
|
|
742
|
-
});
|
|
743
|
-
|
|
744
|
-
const text = `
|
|
745
|
-
Artificial intelligence is transforming industries worldwide.
|
|
746
|
-
Machine learning enables computers to learn from data without
|
|
747
|
-
explicit programming. Deep learning uses neural networks to
|
|
748
|
-
recognize complex patterns in images, text, and audio.
|
|
749
|
-
|
|
750
|
-
The field continues to evolve rapidly. New techniques emerge
|
|
751
|
-
regularly, pushing the boundaries of what's possible.
|
|
752
|
-
`;
|
|
753
|
-
|
|
754
|
-
const chunks = await chunker(text);
|
|
755
|
-
|
|
756
|
-
console.log(`Created ${chunks.length} chunks`);
|
|
757
|
-
|
|
758
|
-
for (const [i, chunk] of chunks.entries()) {
|
|
759
|
-
console.log(`\nChunk ${i + 1}:`);
|
|
760
|
-
console.log(` Text: ${chunk.text.slice(0, 50)}...`);
|
|
761
|
-
console.log(` Tokens: ${chunk.tokenCount}`);
|
|
762
|
-
console.log(` Range: ${chunk.startIndex}-${chunk.endIndex}`);
|
|
763
|
-
}
|
|
764
|
-
```
|
|
765
|
-
|
|
766
|
-
### Recursive chunking with custom rules
|
|
767
|
-
|
|
768
|
-
```typescript
|
|
769
|
-
import { ExuluChunkers } from "@exulu/backend";
|
|
770
|
-
|
|
771
|
-
// Define custom rules for markdown
|
|
772
|
-
const rules = new ExuluChunkers.recursive.rules({
|
|
773
|
-
levels: [
|
|
774
|
-
// Split by headers (keep header with content)
|
|
775
|
-
{
|
|
776
|
-
delimiters: ["\n## ", "\n### "],
|
|
777
|
-
includeDelim: "next"
|
|
778
|
-
},
|
|
779
|
-
// Split by paragraphs
|
|
780
|
-
{ delimiters: ["\n\n"] },
|
|
781
|
-
// Split by sentences
|
|
782
|
-
{ delimiters: [". ", "! ", "? "] },
|
|
783
|
-
// Split by words
|
|
784
|
-
{ whitespace: true }
|
|
785
|
-
]
|
|
786
|
-
});
|
|
787
|
-
|
|
788
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
789
|
-
chunkSize: 1024,
|
|
790
|
-
rules: rules,
|
|
791
|
-
minCharactersPerChunk: 75
|
|
792
|
-
});
|
|
793
|
-
|
|
794
|
-
const markdown = `
|
|
795
|
-
## Introduction
|
|
796
|
-
|
|
797
|
-
Machine learning is a subset of artificial intelligence.
|
|
798
|
-
It enables systems to learn and improve from experience.
|
|
799
|
-
|
|
800
|
-
## Applications
|
|
801
|
-
|
|
802
|
-
Recommendation systems use ML to personalize content.
|
|
803
|
-
Fraud detection systems identify suspicious patterns.
|
|
804
|
-
Autonomous vehicles use ML for navigation and decision-making.
|
|
805
|
-
|
|
806
|
-
## Future Directions
|
|
807
|
-
|
|
808
|
-
The field continues to advance rapidly.
|
|
809
|
-
New architectures and techniques emerge regularly.
|
|
810
|
-
`;
|
|
811
|
-
|
|
812
|
-
const chunks = await chunker(markdown);
|
|
813
|
-
|
|
814
|
-
console.log(`Created ${chunks.length} chunks`);
|
|
815
|
-
|
|
816
|
-
for (const [i, chunk] of chunks.entries()) {
|
|
817
|
-
console.log(`\nChunk ${i + 1} (level ${chunk.level}):`);
|
|
818
|
-
console.log(` Text: ${chunk.text}`);
|
|
819
|
-
console.log(` Tokens: ${chunk.tokenCount}`);
|
|
820
|
-
}
|
|
821
|
-
```
|
|
822
|
-
|
|
823
|
-
### Analyzing chunk statistics
|
|
824
|
-
|
|
825
|
-
```typescript
|
|
826
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
827
|
-
chunkSize: 512,
|
|
828
|
-
chunkOverlap: 50
|
|
829
|
-
});
|
|
830
|
-
|
|
831
|
-
const text = "Your long document...";
|
|
832
|
-
const chunks = await chunker(text);
|
|
833
|
-
|
|
834
|
-
// Calculate statistics
|
|
835
|
-
const tokenCounts = chunks.map(c => c.tokenCount);
|
|
836
|
-
const avgTokens = tokenCounts.reduce((a, b) => a + b, 0) / chunks.length;
|
|
837
|
-
const maxTokens = Math.max(...tokenCounts);
|
|
838
|
-
const minTokens = Math.min(...tokenCounts);
|
|
839
|
-
|
|
840
|
-
console.log(`Chunks: ${chunks.length}`);
|
|
841
|
-
console.log(`Avg tokens: ${avgTokens.toFixed(2)}`);
|
|
842
|
-
console.log(`Max tokens: ${maxTokens}`);
|
|
843
|
-
console.log(`Min tokens: ${minTokens}`);
|
|
844
|
-
console.log(`Total tokens: ${tokenCounts.reduce((a, b) => a + b, 0)}`);
|
|
845
|
-
|
|
846
|
-
// Distribution
|
|
847
|
-
const histogram = {};
|
|
848
|
-
for (const chunk of chunks) {
|
|
849
|
-
const bucket = Math.floor(chunk.tokenCount / 100) * 100;
|
|
850
|
-
histogram[bucket] = (histogram[bucket] || 0) + 1;
|
|
851
|
-
}
|
|
852
|
-
|
|
853
|
-
console.log("\nToken distribution:");
|
|
854
|
-
for (const [bucket, count] of Object.entries(histogram)) {
|
|
855
|
-
console.log(` ${bucket}-${parseInt(bucket) + 99}: ${'*'.repeat(count)}`);
|
|
856
|
-
}
|
|
857
|
-
```
|
|
858
|
-
|
|
859
|
-
### Inspecting level distribution (recursive)
|
|
860
|
-
|
|
861
|
-
```typescript
|
|
862
|
-
const chunker = await ExuluChunkers.recursive.function.create({
|
|
863
|
-
chunkSize: 1024
|
|
864
|
-
});
|
|
865
|
-
|
|
866
|
-
const text = "Your document...";
|
|
867
|
-
const chunks = await chunker(text);
|
|
868
|
-
|
|
869
|
-
// Count chunks by level
|
|
870
|
-
const levelCounts = {};
|
|
871
|
-
for (const chunk of chunks) {
|
|
872
|
-
levelCounts[chunk.level || 0] = (levelCounts[chunk.level || 0] || 0) + 1;
|
|
873
|
-
}
|
|
874
|
-
|
|
875
|
-
console.log("Chunk distribution by level:");
|
|
876
|
-
for (const [level, count] of Object.entries(levelCounts)) {
|
|
877
|
-
const levelName = ["Paragraphs", "Sentences", "Pauses", "Words", "Tokens"][level];
|
|
878
|
-
console.log(` Level ${level} (${levelName}): ${count} chunks`);
|
|
879
|
-
}
|
|
880
|
-
```
|
|
881
|
-
|
|
882
|
-
### Using with ExuluContext
|
|
883
|
-
|
|
884
|
-
```typescript
|
|
885
|
-
import { ExuluContext, ExuluChunkers, ExuluEmbedder } from "@exulu/backend";
|
|
886
|
-
|
|
887
|
-
// Create chunker
|
|
888
|
-
const chunker = await ExuluChunkers.sentence.create({
|
|
889
|
-
chunkSize: 512,
|
|
890
|
-
chunkOverlap: 75
|
|
891
|
-
});
|
|
892
|
-
|
|
893
|
-
// Create embedder
|
|
894
|
-
const embedder = new ExuluEmbedder({
|
|
895
|
-
id: "openai_embedder",
|
|
896
|
-
name: "OpenAI Embeddings",
|
|
897
|
-
provider: "openai",
|
|
898
|
-
model: "text-embedding-3-small",
|
|
899
|
-
vectorDimensions: 1536
|
|
900
|
-
});
|
|
901
|
-
|
|
902
|
-
// Create context with chunker
|
|
903
|
-
const context = new ExuluContext({
|
|
904
|
-
id: "documentation",
|
|
905
|
-
name: "Product Documentation",
|
|
906
|
-
description: "Searchable product documentation",
|
|
907
|
-
embedder: embedder,
|
|
908
|
-
chunker: chunker, // Documents will be chunked automatically
|
|
909
|
-
fields: [
|
|
910
|
-
{ name: "title", type: "text", required: true },
|
|
911
|
-
{ name: "content", type: "longtext", required: true },
|
|
912
|
-
{ name: "url", type: "text", required: false }
|
|
913
|
-
],
|
|
914
|
-
sources: []
|
|
915
|
-
});
|
|
916
|
-
|
|
917
|
-
// Add document - it's automatically chunked and embedded
|
|
918
|
-
await context.createItem(
|
|
919
|
-
{
|
|
920
|
-
title: "Getting Started Guide",
|
|
921
|
-
content: "Very long documentation content...",
|
|
922
|
-
url: "https://example.com/docs/getting-started"
|
|
923
|
-
},
|
|
924
|
-
{ generateEmbeddings: true }
|
|
925
|
-
);
|
|
926
|
-
|
|
927
|
-
// Search returns relevant chunks
|
|
928
|
-
const results = await context.search({
|
|
929
|
-
query: "How do I install?",
|
|
930
|
-
limit: 5
|
|
931
|
-
});
|
|
932
|
-
|
|
933
|
-
for (const result of results) {
|
|
934
|
-
console.log(`Score: ${result.score}`);
|
|
935
|
-
console.log(`Chunk: ${result.chunk.text.slice(0, 100)}...`);
|
|
936
|
-
}
|
|
937
|
-
```
|
|
938
|
-
|
|
939
|
-
## Type definitions
|
|
940
|
-
|
|
941
|
-
```typescript
|
|
942
|
-
// Sentence chunker options
|
|
943
|
-
interface SentenceChunkerOptions {
|
|
944
|
-
chunkSize: number;
|
|
945
|
-
chunkOverlap?: number;
|
|
946
|
-
minSentencesPerChunk?: number;
|
|
947
|
-
minCharactersPerSentence?: number;
|
|
948
|
-
}
|
|
949
|
-
|
|
950
|
-
// Recursive chunker options
|
|
951
|
-
interface RecursiveChunkerOptions {
|
|
952
|
-
chunkSize: number;
|
|
953
|
-
rules?: RecursiveRules;
|
|
954
|
-
minCharactersPerChunk?: number;
|
|
955
|
-
}
|
|
956
|
-
|
|
957
|
-
// Recursive rules data
|
|
958
|
-
interface RecursiveRulesData {
|
|
959
|
-
levels?: RecursiveLevelData[];
|
|
960
|
-
}
|
|
961
|
-
|
|
962
|
-
// Recursive level data
|
|
963
|
-
interface RecursiveLevelData {
|
|
964
|
-
delimiters?: string | string[];
|
|
965
|
-
whitespace?: boolean;
|
|
966
|
-
includeDelim?: "prev" | "next";
|
|
967
|
-
}
|
|
968
|
-
|
|
969
|
-
// Chunk data
|
|
970
|
-
interface ChunkData {
|
|
971
|
-
text: string;
|
|
972
|
-
startIndex: number;
|
|
973
|
-
endIndex: number;
|
|
974
|
-
tokenCount: number;
|
|
975
|
-
embedding?: number[];
|
|
976
|
-
}
|
|
977
|
-
|
|
978
|
-
// Recursive chunk data
|
|
979
|
-
interface RecursiveChunkData extends ChunkData {
|
|
980
|
-
level?: number;
|
|
981
|
-
}
|
|
982
|
-
```
|
|
983
|
-
|
|
984
|
-
## Best practices
|
|
985
|
-
|
|
986
|
-
<Tip>
|
|
987
|
-
**Use an appropriate chunk size**: Stay within your embedding model's token limit, leaving 10-20% headroom for metadata.
|
|
988
|
-
</Tip>
|
|
989
|
-
|
|
990
|
-
<Note>
|
|
991
|
-
**Enable overlap for natural language**: Use 10-20% overlap to preserve context at chunk boundaries.
|
|
992
|
-
</Note>
|
|
993
|
-
|
|
994
|
-
<Warning>
|
|
995
|
-
**Monitor chunk count**: More chunks mean higher embedding costs. Balance granularity against cost.
|
|
996
|
-
</Warning>
|
|
997
|
-
|
|
998
|
-
<Info>
|
|
999
|
-
**Choose the right chunker**: SentenceChunker for most text, RecursiveChunker for structured documents.
|
|
1000
|
-
</Info>
|
|
1001
|
-
|
|
1002
|
-
## Next steps
|
|
1003
|
-
|
|
1004
|
-
<CardGroup cols={2}>
|
|
1005
|
-
<Card title="Configuration guide" icon="gear" href="/core/exulu-chunkers/configuration">
|
|
1006
|
-
Learn about configuration options
|
|
1007
|
-
</Card>
|
|
1008
|
-
<Card title="Overview" icon="book" href="/core/exulu-chunkers/introduction">
|
|
1009
|
-
Understand chunking concepts
|
|
1010
|
-
</Card>
|
|
1011
|
-
</CardGroup>
|