@exulu/backend 1.48.2 → 1.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. package/dist/index.cjs +351 -42
  2. package/dist/index.d.cts +96 -1
  3. package/dist/index.d.ts +96 -1
  4. package/dist/index.js +340 -38
  5. package/ee/{markdown.ts → chunking/markdown.ts} +2 -2
  6. package/ee/python/README.md +295 -0
  7. package/ee/python/documents/processing/README.md +155 -0
  8. package/ee/{documents → python/documents}/processing/doc_processor.ts +25 -17
  9. package/ee/{documents/processing/pdf_to_markdown.py → python/documents/processing/document_to_markdown.py} +3 -10
  10. package/ee/python/setup.sh +180 -0
  11. package/package.json +14 -3
  12. package/scripts/postinstall.cjs +149 -0
  13. package/.agents/skills/mintlify/SKILL.md +0 -347
  14. package/.editorconfig +0 -15
  15. package/.eslintrc.json +0 -52
  16. package/.github/workflows/release-backend.yml +0 -38
  17. package/.husky/commit-msg +0 -1
  18. package/.jscpd.json +0 -18
  19. package/.mcp.json +0 -25
  20. package/.nvmrc +0 -1
  21. package/.prettierignore +0 -5
  22. package/.prettierrc.json +0 -12
  23. package/CHANGELOG.md +0 -8
  24. package/SECURITY.md +0 -5
  25. package/commitlint.config.js +0 -4
  26. package/devops/documentation/patch-older-releases.md +0 -42
  27. package/ee/documents/processing/build_pdf_processor.sh +0 -35
  28. package/ee/documents/processing/chunk_markdown.py +0 -263
  29. package/ee/documents/processing/pdf_processor.spec +0 -115
  30. package/eslint.config.js +0 -88
  31. package/jest.config.ts +0 -25
  32. package/mintlify-docs/.mintignore +0 -7
  33. package/mintlify-docs/AGENTS.md +0 -33
  34. package/mintlify-docs/CLAUDE.MD +0 -50
  35. package/mintlify-docs/CONTRIBUTING.md +0 -32
  36. package/mintlify-docs/LICENSE +0 -21
  37. package/mintlify-docs/README.md +0 -55
  38. package/mintlify-docs/ai-tools/claude-code.mdx +0 -43
  39. package/mintlify-docs/ai-tools/cursor.mdx +0 -39
  40. package/mintlify-docs/ai-tools/windsurf.mdx +0 -39
  41. package/mintlify-docs/api-reference/core-types/agent-types.mdx +0 -110
  42. package/mintlify-docs/api-reference/core-types/analytics-types.mdx +0 -95
  43. package/mintlify-docs/api-reference/core-types/configuration-types.mdx +0 -83
  44. package/mintlify-docs/api-reference/core-types/evaluation-types.mdx +0 -106
  45. package/mintlify-docs/api-reference/core-types/job-types.mdx +0 -135
  46. package/mintlify-docs/api-reference/core-types/overview.mdx +0 -73
  47. package/mintlify-docs/api-reference/core-types/prompt-types.mdx +0 -102
  48. package/mintlify-docs/api-reference/core-types/rbac-types.mdx +0 -163
  49. package/mintlify-docs/api-reference/core-types/session-types.mdx +0 -77
  50. package/mintlify-docs/api-reference/core-types/user-management.mdx +0 -112
  51. package/mintlify-docs/api-reference/core-types/workflow-types.mdx +0 -88
  52. package/mintlify-docs/api-reference/core-types.mdx +0 -585
  53. package/mintlify-docs/api-reference/dynamic-types.mdx +0 -851
  54. package/mintlify-docs/api-reference/endpoint/create.mdx +0 -4
  55. package/mintlify-docs/api-reference/endpoint/delete.mdx +0 -4
  56. package/mintlify-docs/api-reference/endpoint/get.mdx +0 -4
  57. package/mintlify-docs/api-reference/endpoint/webhook.mdx +0 -4
  58. package/mintlify-docs/api-reference/introduction.mdx +0 -661
  59. package/mintlify-docs/api-reference/mutations.mdx +0 -1012
  60. package/mintlify-docs/api-reference/openapi.json +0 -217
  61. package/mintlify-docs/api-reference/queries.mdx +0 -1154
  62. package/mintlify-docs/backend/introduction.mdx +0 -218
  63. package/mintlify-docs/changelog.mdx +0 -387
  64. package/mintlify-docs/community-edition.mdx +0 -304
  65. package/mintlify-docs/core/exulu-agent/api-reference.mdx +0 -894
  66. package/mintlify-docs/core/exulu-agent/configuration.mdx +0 -690
  67. package/mintlify-docs/core/exulu-agent/introduction.mdx +0 -552
  68. package/mintlify-docs/core/exulu-app/api-reference.mdx +0 -481
  69. package/mintlify-docs/core/exulu-app/configuration.mdx +0 -319
  70. package/mintlify-docs/core/exulu-app/introduction.mdx +0 -117
  71. package/mintlify-docs/core/exulu-authentication.mdx +0 -810
  72. package/mintlify-docs/core/exulu-chunkers/api-reference.mdx +0 -1011
  73. package/mintlify-docs/core/exulu-chunkers/configuration.mdx +0 -596
  74. package/mintlify-docs/core/exulu-chunkers/introduction.mdx +0 -403
  75. package/mintlify-docs/core/exulu-context/api-reference.mdx +0 -911
  76. package/mintlify-docs/core/exulu-context/configuration.mdx +0 -648
  77. package/mintlify-docs/core/exulu-context/introduction.mdx +0 -394
  78. package/mintlify-docs/core/exulu-database.mdx +0 -811
  79. package/mintlify-docs/core/exulu-default-agents.mdx +0 -545
  80. package/mintlify-docs/core/exulu-eval/api-reference.mdx +0 -772
  81. package/mintlify-docs/core/exulu-eval/configuration.mdx +0 -680
  82. package/mintlify-docs/core/exulu-eval/introduction.mdx +0 -459
  83. package/mintlify-docs/core/exulu-logging.mdx +0 -464
  84. package/mintlify-docs/core/exulu-otel.mdx +0 -670
  85. package/mintlify-docs/core/exulu-queues/api-reference.mdx +0 -648
  86. package/mintlify-docs/core/exulu-queues/configuration.mdx +0 -650
  87. package/mintlify-docs/core/exulu-queues/introduction.mdx +0 -474
  88. package/mintlify-docs/core/exulu-reranker/api-reference.mdx +0 -630
  89. package/mintlify-docs/core/exulu-reranker/configuration.mdx +0 -663
  90. package/mintlify-docs/core/exulu-reranker/introduction.mdx +0 -516
  91. package/mintlify-docs/core/exulu-tool/api-reference.mdx +0 -723
  92. package/mintlify-docs/core/exulu-tool/configuration.mdx +0 -805
  93. package/mintlify-docs/core/exulu-tool/introduction.mdx +0 -539
  94. package/mintlify-docs/core/exulu-variables/api-reference.mdx +0 -699
  95. package/mintlify-docs/core/exulu-variables/configuration.mdx +0 -736
  96. package/mintlify-docs/core/exulu-variables/introduction.mdx +0 -511
  97. package/mintlify-docs/development.mdx +0 -94
  98. package/mintlify-docs/docs.json +0 -248
  99. package/mintlify-docs/enterprise-edition.mdx +0 -538
  100. package/mintlify-docs/essentials/code.mdx +0 -35
  101. package/mintlify-docs/essentials/images.mdx +0 -59
  102. package/mintlify-docs/essentials/markdown.mdx +0 -88
  103. package/mintlify-docs/essentials/navigation.mdx +0 -87
  104. package/mintlify-docs/essentials/reusable-snippets.mdx +0 -110
  105. package/mintlify-docs/essentials/settings.mdx +0 -318
  106. package/mintlify-docs/favicon.svg +0 -3
  107. package/mintlify-docs/frontend/introduction.mdx +0 -39
  108. package/mintlify-docs/getting-started.mdx +0 -267
  109. package/mintlify-docs/guides/custom-agent.mdx +0 -608
  110. package/mintlify-docs/guides/first-agent.mdx +0 -315
  111. package/mintlify-docs/images/admin_ui.png +0 -0
  112. package/mintlify-docs/images/contexts.png +0 -0
  113. package/mintlify-docs/images/create_agents.png +0 -0
  114. package/mintlify-docs/images/evals.png +0 -0
  115. package/mintlify-docs/images/graphql.png +0 -0
  116. package/mintlify-docs/images/graphql_api.png +0 -0
  117. package/mintlify-docs/images/hero-dark.png +0 -0
  118. package/mintlify-docs/images/hero-light.png +0 -0
  119. package/mintlify-docs/images/hero.png +0 -0
  120. package/mintlify-docs/images/knowledge_sources.png +0 -0
  121. package/mintlify-docs/images/mcp.png +0 -0
  122. package/mintlify-docs/images/scaling.png +0 -0
  123. package/mintlify-docs/index.mdx +0 -411
  124. package/mintlify-docs/logo/dark.svg +0 -9
  125. package/mintlify-docs/logo/light.svg +0 -9
  126. package/mintlify-docs/partners.mdx +0 -558
  127. package/mintlify-docs/products.mdx +0 -77
  128. package/mintlify-docs/snippets/snippet-intro.mdx +0 -4
  129. package/mintlify-docs/styles.css +0 -207
  130. package/ngrok.bash +0 -1
  131. package/ngrok.md +0 -6
  132. package/ngrok.yml +0 -10
  133. package/release.config.cjs +0 -15
  134. package/skills-lock.json +0 -10
  135. package/types/context-processor.ts +0 -45
  136. package/types/enums/eval-types.ts +0 -5
  137. package/types/enums/field-types.ts +0 -1
  138. package/types/enums/jobs.ts +0 -11
  139. package/types/enums/statistics.ts +0 -13
  140. package/types/exulu-table-definition.ts +0 -79
  141. package/types/file-types.ts +0 -18
  142. package/types/models/agent-session.ts +0 -27
  143. package/types/models/agent.ts +0 -68
  144. package/types/models/context.ts +0 -53
  145. package/types/models/embedding.ts +0 -17
  146. package/types/models/eval-run.ts +0 -40
  147. package/types/models/exulu-agent-tool-config.ts +0 -11
  148. package/types/models/item.ts +0 -21
  149. package/types/models/job.ts +0 -8
  150. package/types/models/project.ts +0 -16
  151. package/types/models/rate-limiter-rules.ts +0 -7
  152. package/types/models/test-case.ts +0 -25
  153. package/types/models/tool.ts +0 -9
  154. package/types/models/user-role.ts +0 -12
  155. package/types/models/user.ts +0 -20
  156. package/types/models/variable.ts +0 -8
  157. package/types/models/vector-methods.ts +0 -7
  158. package/types/provider-config.ts +0 -21
  159. package/types/queue-config.ts +0 -16
  160. package/types/rbac-rights-modes.ts +0 -1
  161. package/types/statistics.ts +0 -20
  162. package/types/workflow.ts +0 -31
  163. /package/ee/{documents → python/documents}/THIRD_PARTY_LICENSES/docling.txt +0 -0
  164. /package/ee/{documents/processing → python}/requirements.txt +0 -0
@@ -1,1011 +0,0 @@
1
- ---
2
- title: "API reference"
3
- description: "Complete method and property reference for ExuluChunkers"
4
- ---
5
-
6
- ## ExuluChunkers namespace
7
-
8
- ExuluChunkers is exported as a namespace object:
9
-
10
- ```typescript
11
- import { ExuluChunkers } from "@exulu/backend";
12
-
13
- // Access sentence chunker
14
- const sentenceChunker = await ExuluChunkers.sentence.create({...});
15
-
16
- // Access recursive chunker
17
- const recursiveChunker = await ExuluChunkers.recursive.function.create({...});
18
-
19
- // Access recursive rules
20
- const rules = new ExuluChunkers.recursive.rules({...});
21
- ```
22
-
23
- ## SentenceChunker
24
-
25
- ### create()
26
-
27
- Factory method to create a new SentenceChunker instance.
28
-
29
- ```typescript
30
- static async create(options: SentenceChunkerOptions): Promise<CallableSentenceChunker>
31
- ```
32
-
33
- <ParamField path="options" type="SentenceChunkerOptions" required>
34
- Configuration options for the chunker
35
- </ParamField>
36
-
37
- <ParamField path="options.chunkSize" type="number" required>
38
- Maximum number of tokens per chunk
39
- </ParamField>
40
-
41
- <ParamField path="options.chunkOverlap" type="number" default={0}>
42
- Number of tokens to overlap between chunks (default: 0)
43
- </ParamField>
44
-
45
- <ParamField path="options.minSentencesPerChunk" type="number" default={1}>
46
- Minimum sentences per chunk (default: 1)
47
- </ParamField>
48
-
49
- <ParamField path="options.minCharactersPerSentence" type="number" default={10}>
50
- Minimum character length for a sentence (default: 10)
51
- </ParamField>
52
-
53
- <ResponseField name="return" type="Promise<CallableSentenceChunker>">
54
- A callable chunker function that can be invoked with text
55
- </ResponseField>
56
-
57
- ```typescript
58
- import { ExuluChunkers } from "@exulu/backend";
59
-
60
- // Create chunker
61
- const chunker = await ExuluChunkers.sentence.create({
62
- chunkSize: 512,
63
- chunkOverlap: 50,
64
- minSentencesPerChunk: 2,
65
- minCharactersPerSentence: 15
66
- });
67
-
68
- // Use chunker
69
- const text = "Your document text here...";
70
- const chunks = await chunker(text);
71
-
72
- console.log(chunks.length); // Number of chunks
73
- console.log(chunks[0].text); // First chunk text
74
- console.log(chunks[0].tokenCount); // Token count
75
- ```
76
-
77
- ### CallableSentenceChunker
78
-
79
- The chunker returned by `create()` is a callable function:
80
-
81
- ```typescript
82
- async (text: string): Promise<Chunk[]>
83
- ```
84
-
85
- <ParamField path="text" type="string" required>
86
- The text to chunk
87
- </ParamField>
88
-
89
- <ResponseField name="return" type="Promise<Chunk[]>">
90
- Array of Chunk objects
91
- </ResponseField>
92
-
93
- ```typescript
94
- const chunks = await chunker("Long text to chunk...");
95
-
96
- for (const chunk of chunks) {
97
- console.log(chunk.text);
98
- console.log(chunk.tokenCount);
99
- console.log(chunk.startIndex, chunk.endIndex);
100
- }
101
- ```
102
-
103
- ### Properties
104
-
105
- The callable chunker also has properties from the SentenceChunker class:
106
-
107
- <ResponseField name="chunkSize" type="number">
108
- Maximum tokens per chunk
109
- </ResponseField>
110
-
111
- <ResponseField name="chunkOverlap" type="number">
112
- Overlap in tokens
113
- </ResponseField>
114
-
115
- <ResponseField name="minSentencesPerChunk" type="number">
116
- Minimum sentences per chunk
117
- </ResponseField>
118
-
119
- <ResponseField name="minCharactersPerSentence" type="number">
120
- Minimum characters per sentence
121
- </ResponseField>
122
-
123
- <ResponseField name="tokenizer" type="ExuluTokenizer">
124
- The tokenizer instance used for counting tokens
125
- </ResponseField>
126
-
127
- ```typescript
128
- console.log(chunker.chunkSize); // 512
129
- console.log(chunker.chunkOverlap); // 50
130
- console.log(chunker.minSentencesPerChunk); // 2
131
- ```
132
-
133
- ## RecursiveChunker
134
-
135
- ### create()
136
-
137
- Factory method to create a new RecursiveChunker instance.
138
-
139
- ```typescript
140
- static async create(options: RecursiveChunkerOptions): Promise<CallableRecursiveChunker>
141
- ```
142
-
143
- <ParamField path="options" type="RecursiveChunkerOptions" required>
144
- Configuration options for the chunker
145
- </ParamField>
146
-
147
- <ParamField path="options.chunkSize" type="number" required>
148
- Maximum number of tokens per chunk
149
- </ParamField>
150
-
151
- <ParamField path="options.rules" type="RecursiveRules" default="default rules">
152
- Recursive splitting rules (default: paragraphs → sentences → pauses → words → tokens)
153
- </ParamField>
154
-
155
- <ParamField path="options.minCharactersPerChunk" type="number" default={50}>
156
- Minimum character length for a chunk (default: 50)
157
- </ParamField>
158
-
159
- <ResponseField name="return" type="Promise<CallableRecursiveChunker>">
160
- A callable chunker function that can be invoked with text
161
- </ResponseField>
162
-
163
- ```typescript
164
- import { ExuluChunkers } from "@exulu/backend";
165
-
166
- // Create with default rules
167
- const chunker = await ExuluChunkers.recursive.function.create({
168
- chunkSize: 1024,
169
- minCharactersPerChunk: 75
170
- });
171
-
172
- // Or with custom rules
173
- const rules = new ExuluChunkers.recursive.rules({
174
- levels: [
175
- { delimiters: ["\n\n"] },
176
- { delimiters: [". "] },
177
- { whitespace: true }
178
- ]
179
- });
180
-
181
- const customChunker = await ExuluChunkers.recursive.function.create({
182
- chunkSize: 1024,
183
- rules: rules,
184
- minCharactersPerChunk: 50
185
- });
186
- ```
187
-
188
- ### CallableRecursiveChunker
189
-
190
- The chunker returned by `create()` is a callable function:
191
-
192
- ```typescript
193
- async (text: string): Promise<RecursiveChunk[]>
194
- ```
195
-
196
- <ParamField path="text" type="string" required>
197
- The text to chunk
198
- </ParamField>
199
-
200
- <ResponseField name="return" type="Promise<RecursiveChunk[]>">
201
- Array of RecursiveChunk objects
202
- </ResponseField>
203
-
204
- ```typescript
205
- const chunks = await chunker("Long text to chunk...");
206
-
207
- for (const chunk of chunks) {
208
- console.log(`Level ${chunk.level}: ${chunk.text}`);
209
- console.log(`Tokens: ${chunk.tokenCount}`);
210
- console.log(`Range: ${chunk.startIndex}-${chunk.endIndex}`);
211
- }
212
- ```
213
-
214
- ### Properties
215
-
216
- The callable chunker also has properties from the RecursiveChunker class:
217
-
218
- <ResponseField name="chunkSize" type="number">
219
- Maximum tokens per chunk
220
- </ResponseField>
221
-
222
- <ResponseField name="rules" type="RecursiveRules">
223
- The recursive splitting rules
224
- </ResponseField>
225
-
226
- <ResponseField name="minCharactersPerChunk" type="number">
227
- Minimum characters per chunk
228
- </ResponseField>
229
-
230
- <ResponseField name="tokenizer" type="ExuluTokenizer">
231
- The tokenizer instance used for counting tokens
232
- </ResponseField>
233
-
234
- ```typescript
235
- console.log(chunker.chunkSize); // 1024
236
- console.log(chunker.minCharactersPerChunk); // 75
237
- console.log(chunker.rules.length); // Number of levels
238
- ```
239
-
240
- ## RecursiveRules
241
-
242
- Class representing recursive chunking rules.
243
-
244
- ### Constructor
245
-
246
- ```typescript
247
- new RecursiveRules(data?: RecursiveRulesData)
248
- ```
249
-
250
- <ParamField path="data" type="RecursiveRulesData">
251
- Configuration for recursive rules
252
- </ParamField>
253
-
254
- <ParamField path="data.levels" type="RecursiveLevelData[]">
255
- Array of recursive levels defining the splitting hierarchy
256
- </ParamField>
257
-
258
- ```typescript
259
- import { ExuluChunkers } from "@exulu/backend";
260
-
261
- // Create with default levels
262
- const defaultRules = new ExuluChunkers.recursive.rules();
263
-
264
- // Create with custom levels
265
- const customRules = new ExuluChunkers.recursive.rules({
266
- levels: [
267
- { delimiters: ["\n\n", "\n"] },
268
- { delimiters: [". ", "! ", "? "] },
269
- { whitespace: true }
270
- ]
271
- });
272
- ```
273
-
274
- **Default levels:**
275
- 1. Paragraphs: `["\n\n", "\r\n", "\n", "\r"]`
276
- 2. Sentences: `[". ", "! ", "? "]`
277
- 3. Pauses: ``["{", "}", '"', "[", "]", "<", ">", "(", ")", ":", ";", ",", "—", "|", "~", "-", "...", "`", "'"]``
278
- 4. Words: `whitespace: true`
279
- 5. Tokens: No delimiters
280
-
281
- ### Properties
282
-
283
- <ResponseField name="levels" type="RecursiveLevel[]">
284
- Array of recursive levels
285
- </ResponseField>
286
-
287
- <ResponseField name="length" type="number" getter>
288
- Number of levels in the rules
289
- </ResponseField>
290
-
291
- ```typescript
292
- const rules = new ExuluChunkers.recursive.rules();
293
-
294
- console.log(rules.length); // 5 (default levels)
295
- console.log(rules.levels[0]); // First level (paragraphs)
296
- ```
297
-
298
- ### Methods
299
-
300
- #### getLevel()
301
-
302
- Get a level by index.
303
-
304
- ```typescript
305
- getLevel(index: number): RecursiveLevel | undefined
306
- ```
307
-
308
- <ParamField path="index" type="number" required>
309
- The index of the level to retrieve
310
- </ParamField>
311
-
312
- <ResponseField name="return" type="RecursiveLevel | undefined">
313
- The level at the specified index, or undefined if not found
314
- </ResponseField>
315
-
316
- ```typescript
317
- const rules = new ExuluChunkers.recursive.rules();
318
-
319
- const firstLevel = rules.getLevel(0); // Paragraphs level
320
- const secondLevel = rules.getLevel(1); // Sentences level
321
- const invalid = rules.getLevel(999); // undefined
322
- ```
323
-
324
- #### toDict()
325
-
326
- Convert rules to a dictionary-like object.
327
-
328
- ```typescript
329
- toDict(): RecursiveRulesData
330
- ```
331
-
332
- <ResponseField name="return" type="RecursiveRulesData">
333
- Dictionary representation of the rules
334
- </ResponseField>
335
-
336
- ```typescript
337
- const rules = new ExuluChunkers.recursive.rules({
338
- levels: [
339
- { delimiters: ["\n\n"] },
340
- { whitespace: true }
341
- ]
342
- });
343
-
344
- const dict = rules.toDict();
345
- console.log(dict);
346
- // { levels: [{ delimiters: ["\n\n"], whitespace: false, includeDelim: "prev" }, ...] }
347
- ```
348
-
349
- #### fromDict()
350
-
351
- Create RecursiveRules from a dictionary.
352
-
353
- ```typescript
354
- static fromDict(data: RecursiveRulesData): RecursiveRules
355
- ```
356
-
357
- <ParamField path="data" type="RecursiveRulesData" required>
358
- Dictionary representation of rules
359
- </ParamField>
360
-
361
- <ResponseField name="return" type="RecursiveRules">
362
- New RecursiveRules instance
363
- </ResponseField>
364
-
365
- ```typescript
366
- const data = {
367
- levels: [
368
- { delimiters: ["\n\n"] },
369
- { whitespace: true }
370
- ]
371
- };
372
-
373
- const rules = ExuluChunkers.recursive.rules.fromDict(data);
374
- ```
375
-
376
- #### toString()
377
-
378
- String representation of the rules.
379
-
380
- ```typescript
381
- toString(): string
382
- ```
383
-
384
- <ResponseField name="return" type="string">
385
- String representation
386
- </ResponseField>
387
-
388
- ```typescript
389
- const rules = new ExuluChunkers.recursive.rules();
390
- console.log(rules.toString());
391
- // "RecursiveRules(levels=[...])"
392
- ```
393
-
394
- #### Symbol.iterator
395
-
396
- The rules object is iterable:
397
-
398
- ```typescript
399
- for (const level of rules) {
400
- console.log(level.delimiters);
401
- console.log(level.whitespace);
402
- }
403
- ```
404
-
405
- ## RecursiveLevel
406
-
407
- Class representing a single level in the recursive hierarchy.
408
-
409
- ### Constructor
410
-
411
- ```typescript
412
- new RecursiveLevel(data?: RecursiveLevelData)
413
- ```
414
-
415
- <ParamField path="data" type="RecursiveLevelData">
416
- Configuration for the level
417
- </ParamField>
418
-
419
- <ParamField path="data.delimiters" type="string | string[]">
420
- Delimiter(s) to use for splitting at this level
421
- </ParamField>
422
-
423
- <ParamField path="data.whitespace" type="boolean" default={false}>
424
- Whether to split on whitespace (default: false)
425
- </ParamField>
426
-
427
- <ParamField path="data.includeDelim" type="'prev' | 'next'" default="prev">
428
- Whether to include delimiter in previous or next chunk (default: "prev")
429
- </ParamField>
430
-
431
- ```typescript
432
- // Single delimiter
433
- const level1 = new RecursiveLevel({
434
- delimiters: "\n\n"
435
- });
436
-
437
- // Multiple delimiters
438
- const level2 = new RecursiveLevel({
439
- delimiters: [". ", "! ", "? "],
440
- includeDelim: "prev"
441
- });
442
-
443
- // Whitespace splitting
444
- const level3 = new RecursiveLevel({
445
- whitespace: true
446
- });
447
-
448
- // No delimiters (token-level fallback)
449
- const level4 = new RecursiveLevel();
450
- ```
451
-
452
- <Warning>
453
- You cannot use both `delimiters` and `whitespace` in the same level; the two options are mutually exclusive.
454
- </Warning>
455
-
456
- ### Properties
457
-
458
- <ResponseField name="delimiters" type="string | string[] | undefined">
459
- Custom delimiters for chunking
460
- </ResponseField>
461
-
462
- <ResponseField name="whitespace" type="boolean">
463
- Whether to use whitespace as a delimiter
464
- </ResponseField>
465
-
466
- <ResponseField name="includeDelim" type="'prev' | 'next'">
467
- Where to include the delimiter
468
- </ResponseField>
469
-
470
- ```typescript
471
- const level = new RecursiveLevel({
472
- delimiters: [". ", "! ", "? "],
473
- includeDelim: "prev"
474
- });
475
-
476
- console.log(level.delimiters); // [". ", "! ", "? "]
477
- console.log(level.whitespace); // false
478
- console.log(level.includeDelim); // "prev"
479
- ```
480
-
481
- ### Methods
482
-
483
- #### toDict()
484
-
485
- Convert level to dictionary.
486
-
487
- ```typescript
488
- toDict(): RecursiveLevelData
489
- ```
490
-
491
- <ResponseField name="return" type="RecursiveLevelData">
492
- Dictionary representation
493
- </ResponseField>
494
-
495
- ```typescript
496
- const level = new RecursiveLevel({ delimiters: [". "] });
497
- const dict = level.toDict();
498
- console.log(dict);
499
- // { delimiters: [". "], whitespace: false, includeDelim: "prev" }
500
- ```
501
-
502
- #### fromDict()
503
-
504
- Create RecursiveLevel from dictionary.
505
-
506
- ```typescript
507
- static fromDict(data: RecursiveLevelData): RecursiveLevel
508
- ```
509
-
510
- <ParamField path="data" type="RecursiveLevelData" required>
511
- Dictionary representation
512
- </ParamField>
513
-
514
- <ResponseField name="return" type="RecursiveLevel">
515
- New RecursiveLevel instance
516
- </ResponseField>
517
-
518
- ```typescript
519
- const data = { delimiters: [". "], includeDelim: "next" };
520
- const level = RecursiveLevel.fromDict(data);
521
- ```
522
-
523
- #### toString()
524
-
525
- String representation of the level.
526
-
527
- ```typescript
528
- toString(): string
529
- ```
530
-
531
- <ResponseField name="return" type="string">
532
- String representation
533
- </ResponseField>
534
-
535
- ```typescript
536
- const level = new RecursiveLevel({ delimiters: [". "] });
537
- console.log(level.toString());
538
- // "RecursiveLevel(delimiters=[". "], whitespace=false, includeDelim=prev)"
539
- ```
540
-
541
- ## Chunk
542
-
543
- Base class for text chunks.
544
-
545
- ### Properties
546
-
547
- <ResponseField name="text" type="string">
548
- The chunk text
549
- </ResponseField>
550
-
551
- <ResponseField name="startIndex" type="number">
552
- Starting index in the original text
553
- </ResponseField>
554
-
555
- <ResponseField name="endIndex" type="number">
556
- Ending index in the original text
557
- </ResponseField>
558
-
559
- <ResponseField name="tokenCount" type="number">
560
- Number of tokens in the chunk
561
- </ResponseField>
562
-
563
- <ResponseField name="embedding" type="number[] | undefined">
564
- Optional embedding vector for the chunk
565
- </ResponseField>
566
-
567
- ```typescript
568
- const chunk = chunks[0];
569
-
570
- console.log(chunk.text); // "This is the first chunk..."
571
- console.log(chunk.startIndex); // 0
572
- console.log(chunk.endIndex); // 245
573
- console.log(chunk.tokenCount); // 48
574
- console.log(chunk.embedding); // undefined (or embedding array)
575
- ```
576
-
577
- ### Methods
578
-
579
- #### toString()
580
-
581
- String representation of the chunk (returns the text).
582
-
583
- ```typescript
584
- toString(): string
585
- ```
586
-
587
- <ResponseField name="return" type="string">
588
- The chunk text
589
- </ResponseField>
590
-
591
- ```typescript
592
- console.log(chunk.toString()); // "This is the first chunk..."
593
- ```
594
-
595
- #### toRepresentation()
596
-
597
- Detailed string representation.
598
-
599
- ```typescript
600
- toRepresentation(): string
601
- ```
602
-
603
- <ResponseField name="return" type="string">
604
- Detailed representation
605
- </ResponseField>
606
-
607
- ```typescript
608
- console.log(chunk.toRepresentation());
609
- // "Chunk(text='...', tokenCount=48, startIndex=0, endIndex=245)"
610
- ```
611
-
612
- #### slice()
613
-
614
- Get a slice of the chunk's text.
615
-
616
- ```typescript
617
- slice(start?: number, end?: number): string
618
- ```
619
-
620
- <ParamField path="start" type="number">
621
- Starting index for the slice
622
- </ParamField>
623
-
624
- <ParamField path="end" type="number">
625
- Ending index for the slice
626
- </ParamField>
627
-
628
- <ResponseField name="return" type="string">
629
- Sliced text
630
- </ResponseField>
631
-
632
- ```typescript
633
- const chunk = chunks[0];
634
- console.log(chunk.slice(0, 50)); // First 50 characters
635
- ```
636
-
637
- #### toDict()
638
-
639
- Convert chunk to dictionary.
640
-
641
- ```typescript
642
- toDict(): ChunkData
643
- ```
644
-
645
- <ResponseField name="return" type="ChunkData">
646
- Dictionary representation
647
- </ResponseField>
648
-
649
- ```typescript
650
- const dict = chunk.toDict();
651
- console.log(dict);
652
- // { text: "...", startIndex: 0, endIndex: 245, tokenCount: 48, embedding: undefined }
653
- ```
654
-
655
- #### fromDict()
656
-
657
- Create Chunk from dictionary.
658
-
659
- ```typescript
660
- static fromDict(data: ChunkData): Chunk
661
- ```
662
-
663
- <ParamField path="data" type="ChunkData" required>
664
- Dictionary representation
665
- </ParamField>
666
-
667
- <ResponseField name="return" type="Chunk">
668
- New Chunk instance
669
- </ResponseField>
670
-
671
- ```typescript
672
- const data = {
673
- text: "Sample text",
674
- startIndex: 0,
675
- endIndex: 11,
676
- tokenCount: 3
677
- };
678
-
679
- const chunk = Chunk.fromDict(data);
680
- ```
681
-
682
- #### copy()
683
-
684
- Create a deep copy of the chunk.
685
-
686
- ```typescript
687
- copy(): Chunk
688
- ```
689
-
690
- <ResponseField name="return" type="Chunk">
691
- Deep copy of the chunk
692
- </ResponseField>
693
-
694
- ```typescript
695
- const original = chunks[0];
696
- const copy = original.copy();
697
-
698
- console.log(copy.text === original.text); // true
699
- console.log(copy === original); // false (different objects)
700
- ```
701
-
702
- ## RecursiveChunk
703
-
704
- Extends `Chunk` with recursion level tracking.
705
-
706
- ### Properties
707
-
708
- All properties from `Chunk`, plus:
709
-
710
- <ResponseField name="level" type="number | undefined">
711
- The recursion level at which this chunk was created
712
- </ResponseField>
713
-
714
- ```typescript
715
- const chunk = chunks[0];
716
-
717
- console.log(chunk.text); // "This is the first chunk..."
718
- console.log(chunk.tokenCount); // 48
719
- console.log(chunk.level); // 0 (split at top level)
720
- ```
721
-
722
- **Level interpretation:**
723
- - `0`: Split at first level (e.g., paragraphs)
724
- - `1`: Split at second level (e.g., sentences)
725
- - `2`: Split at third level (e.g., pauses)
726
- - etc.
727
-
728
- ### Methods
729
-
730
- All methods from `Chunk`, with overridden implementations that preserve the `level` property.
731
-
732
- ## Usage examples
733
-
734
- ### Basic sentence chunking
735
-
736
- ```typescript
737
- import { ExuluChunkers } from "@exulu/backend";
738
-
739
- const chunker = await ExuluChunkers.sentence.create({
740
- chunkSize: 512,
741
- chunkOverlap: 50
742
- });
743
-
744
- const text = `
745
- Artificial intelligence is transforming industries worldwide.
746
- Machine learning enables computers to learn from data without
747
- explicit programming. Deep learning uses neural networks to
748
- recognize complex patterns in images, text, and audio.
749
-
750
- The field continues to evolve rapidly. New techniques emerge
751
- regularly, pushing the boundaries of what's possible.
752
- `;
753
-
754
- const chunks = await chunker(text);
755
-
756
- console.log(`Created ${chunks.length} chunks`);
757
-
758
- for (const [i, chunk] of chunks.entries()) {
759
- console.log(`\nChunk ${i + 1}:`);
760
- console.log(` Text: ${chunk.text.slice(0, 50)}...`);
761
- console.log(` Tokens: ${chunk.tokenCount}`);
762
- console.log(` Range: ${chunk.startIndex}-${chunk.endIndex}`);
763
- }
764
- ```
765
-
766
- ### Recursive chunking with custom rules
767
-
768
- ```typescript
769
- import { ExuluChunkers } from "@exulu/backend";
770
-
771
- // Define custom rules for markdown
772
- const rules = new ExuluChunkers.recursive.rules({
773
- levels: [
774
- // Split by headers (keep header with content)
775
- {
776
- delimiters: ["\n## ", "\n### "],
777
- includeDelim: "next"
778
- },
779
- // Split by paragraphs
780
- { delimiters: ["\n\n"] },
781
- // Split by sentences
782
- { delimiters: [". ", "! ", "? "] },
783
- // Split by words
784
- { whitespace: true }
785
- ]
786
- });
787
-
788
- const chunker = await ExuluChunkers.recursive.function.create({
789
- chunkSize: 1024,
790
- rules: rules,
791
- minCharactersPerChunk: 75
792
- });
793
-
794
- const markdown = `
795
- ## Introduction
796
-
797
- Machine learning is a subset of artificial intelligence.
798
- It enables systems to learn and improve from experience.
799
-
800
- ## Applications
801
-
802
- Recommendation systems use ML to personalize content.
803
- Fraud detection systems identify suspicious patterns.
804
- Autonomous vehicles use ML for navigation and decision-making.
805
-
806
- ## Future Directions
807
-
808
- The field continues to advance rapidly.
809
- New architectures and techniques emerge regularly.
810
- `;
811
-
812
- const chunks = await chunker(markdown);
813
-
814
- console.log(`Created ${chunks.length} chunks`);
815
-
816
- for (const [i, chunk] of chunks.entries()) {
817
- console.log(`\nChunk ${i + 1} (level ${chunk.level}):`);
818
- console.log(` Text: ${chunk.text}`);
819
- console.log(` Tokens: ${chunk.tokenCount}`);
820
- }
821
- ```
822
-
823
- ### Analyzing chunk statistics
824
-
825
- ```typescript
826
- const chunker = await ExuluChunkers.sentence.create({
827
- chunkSize: 512,
828
- chunkOverlap: 50
829
- });
830
-
831
- const text = "Your long document...";
832
- const chunks = await chunker(text);
833
-
834
- // Calculate statistics
835
- const tokenCounts = chunks.map(c => c.tokenCount);
836
- const avgTokens = tokenCounts.reduce((a, b) => a + b, 0) / chunks.length;
837
- const maxTokens = Math.max(...tokenCounts);
838
- const minTokens = Math.min(...tokenCounts);
839
-
840
- console.log(`Chunks: ${chunks.length}`);
841
- console.log(`Avg tokens: ${avgTokens.toFixed(2)}`);
842
- console.log(`Max tokens: ${maxTokens}`);
843
- console.log(`Min tokens: ${minTokens}`);
844
- console.log(`Total tokens: ${tokenCounts.reduce((a, b) => a + b, 0)}`);
845
-
846
- // Distribution
847
- const histogram: Record<number, number> = {};
848
- for (const chunk of chunks) {
849
- const bucket = Math.floor(chunk.tokenCount / 100) * 100;
850
- histogram[bucket] = (histogram[bucket] || 0) + 1;
851
- }
852
-
853
- console.log("\nToken distribution:");
854
- for (const [bucket, count] of Object.entries(histogram)) {
855
- console.log(` ${bucket}-${parseInt(bucket) + 99}: ${'*'.repeat(count)}`);
856
- }
857
- ```
858
-
859
- ### Inspecting level distribution (recursive)
860
-
861
- ```typescript
862
- const chunker = await ExuluChunkers.recursive.function.create({
863
- chunkSize: 1024
864
- });
865
-
866
- const text = "Your document...";
867
- const chunks = await chunker(text);
868
-
869
- // Count chunks by level
870
- const levelCounts: Record<number, number> = {};
871
- for (const chunk of chunks) {
872
- levelCounts[chunk.level || 0] = (levelCounts[chunk.level || 0] || 0) + 1;
873
- }
874
-
875
- console.log("Chunk distribution by level:");
876
- for (const [level, count] of Object.entries(levelCounts)) {
877
- const levelName = ["Paragraphs", "Sentences", "Pauses", "Words", "Tokens"][Number(level)];
878
- console.log(` Level ${level} (${levelName}): ${count} chunks`);
879
- }
880
- ```
881
-
882
- ### Using with ExuluContext
883
-
884
- ```typescript
885
- import { ExuluContext, ExuluChunkers, ExuluEmbedder } from "@exulu/backend";
886
-
887
- // Create chunker
888
- const chunker = await ExuluChunkers.sentence.create({
889
- chunkSize: 512,
890
- chunkOverlap: 75
891
- });
892
-
893
- // Create embedder
894
- const embedder = new ExuluEmbedder({
895
- id: "openai_embedder",
896
- name: "OpenAI Embeddings",
897
- provider: "openai",
898
- model: "text-embedding-3-small",
899
- vectorDimensions: 1536
900
- });
901
-
902
- // Create context with chunker
903
- const context = new ExuluContext({
904
- id: "documentation",
905
- name: "Product Documentation",
906
- description: "Searchable product documentation",
907
- embedder: embedder,
908
- chunker: chunker, // Documents will be chunked automatically
909
- fields: [
910
- { name: "title", type: "text", required: true },
911
- { name: "content", type: "longtext", required: true },
912
- { name: "url", type: "text", required: false }
913
- ],
914
- sources: []
915
- });
916
-
917
- // Add document - it's automatically chunked and embedded
918
- await context.createItem(
919
- {
920
- title: "Getting Started Guide",
921
- content: "Very long documentation content...",
922
- url: "https://example.com/docs/getting-started"
923
- },
924
- { generateEmbeddings: true }
925
- );
926
-
927
- // Search returns relevant chunks
928
- const results = await context.search({
929
- query: "How do I install?",
930
- limit: 5
931
- });
932
-
933
- for (const result of results) {
934
- console.log(`Score: ${result.score}`);
935
- console.log(`Chunk: ${result.chunk.text.slice(0, 100)}...`);
936
- }
937
- ```
938
-
939
- ## Type definitions
940
-
941
- ```typescript
942
- // Sentence chunker options
943
- interface SentenceChunkerOptions {
944
- chunkSize: number;
945
- chunkOverlap?: number;
946
- minSentencesPerChunk?: number;
947
- minCharactersPerSentence?: number;
948
- }
949
-
950
- // Recursive chunker options
951
- interface RecursiveChunkerOptions {
952
- chunkSize: number;
953
- rules?: RecursiveRules;
954
- minCharactersPerChunk?: number;
955
- }
956
-
957
- // Recursive rules data
958
- interface RecursiveRulesData {
959
- levels?: RecursiveLevelData[];
960
- }
961
-
962
- // Recursive level data
963
- interface RecursiveLevelData {
964
- delimiters?: string | string[];
965
- whitespace?: boolean;
966
- includeDelim?: "prev" | "next";
967
- }
968
-
969
- // Chunk data
970
- interface ChunkData {
971
- text: string;
972
- startIndex: number;
973
- endIndex: number;
974
- tokenCount: number;
975
- embedding?: number[];
976
- }
977
-
978
- // Recursive chunk data
979
- interface RecursiveChunkData extends ChunkData {
980
- level?: number;
981
- }
982
- ```
983
-
984
- ## Best practices
985
-
986
- <Tip>
987
- **Use appropriate chunk size**: Match your embedding model's token limit. Leave 10-20% headroom for metadata.
988
- </Tip>
989
-
990
- <Note>
991
- **Enable overlap for natural language**: Use 10-20% overlap to preserve context at chunk boundaries.
992
- </Note>
993
-
994
- <Warning>
995
- **Monitor chunk count**: More chunks = higher embedding costs. Balance granularity with cost.
996
- </Warning>
997
-
998
- <Info>
999
- **Choose the right chunker**: SentenceChunker for most text, RecursiveChunker for structured documents.
1000
- </Info>
1001
-
1002
- ## Next steps
1003
-
1004
- <CardGroup cols={2}>
1005
- <Card title="Configuration guide" icon="gear" href="/core/exulu-chunkers/configuration">
1006
- Learn about configuration options
1007
- </Card>
1008
- <Card title="Overview" icon="book" href="/core/exulu-chunkers/introduction">
1009
- Understand chunking concepts
1010
- </Card>
1011
- </CardGroup>