@mastra/rag 1.2.2 → 1.2.3-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CHANGELOG.md +22 -0
  2. package/dist/index.cjs +25 -9
  3. package/dist/index.cjs.map +1 -1
  4. package/dist/index.js +25 -9
  5. package/dist/index.js.map +1 -1
  6. package/dist/tools/graph-rag.d.ts.map +1 -1
  7. package/dist/tools/types.d.ts +18 -5
  8. package/dist/tools/types.d.ts.map +1 -1
  9. package/dist/tools/vector-query.d.ts.map +1 -1
  10. package/dist/utils/vector-search.d.ts +6 -7
  11. package/dist/utils/vector-search.d.ts.map +1 -1
  12. package/package.json +19 -6
  13. package/.turbo/turbo-build.log +0 -4
  14. package/docker-compose.yaml +0 -22
  15. package/eslint.config.js +0 -6
  16. package/src/document/document.test.ts +0 -2975
  17. package/src/document/document.ts +0 -335
  18. package/src/document/extractors/base.ts +0 -30
  19. package/src/document/extractors/index.ts +0 -5
  20. package/src/document/extractors/keywords.test.ts +0 -125
  21. package/src/document/extractors/keywords.ts +0 -126
  22. package/src/document/extractors/questions.test.ts +0 -120
  23. package/src/document/extractors/questions.ts +0 -111
  24. package/src/document/extractors/summary.test.ts +0 -107
  25. package/src/document/extractors/summary.ts +0 -122
  26. package/src/document/extractors/title.test.ts +0 -121
  27. package/src/document/extractors/title.ts +0 -185
  28. package/src/document/extractors/types.ts +0 -40
  29. package/src/document/index.ts +0 -2
  30. package/src/document/prompts/base.ts +0 -77
  31. package/src/document/prompts/format.ts +0 -9
  32. package/src/document/prompts/index.ts +0 -15
  33. package/src/document/prompts/prompt.ts +0 -60
  34. package/src/document/prompts/types.ts +0 -29
  35. package/src/document/schema/index.ts +0 -3
  36. package/src/document/schema/node.ts +0 -187
  37. package/src/document/schema/types.ts +0 -40
  38. package/src/document/transformers/character.ts +0 -267
  39. package/src/document/transformers/html.ts +0 -346
  40. package/src/document/transformers/json.ts +0 -536
  41. package/src/document/transformers/latex.ts +0 -11
  42. package/src/document/transformers/markdown.ts +0 -239
  43. package/src/document/transformers/semantic-markdown.ts +0 -227
  44. package/src/document/transformers/sentence.ts +0 -314
  45. package/src/document/transformers/text.ts +0 -158
  46. package/src/document/transformers/token.ts +0 -137
  47. package/src/document/transformers/transformer.ts +0 -5
  48. package/src/document/types.ts +0 -145
  49. package/src/document/validation.ts +0 -158
  50. package/src/graph-rag/index.test.ts +0 -235
  51. package/src/graph-rag/index.ts +0 -306
  52. package/src/index.ts +0 -8
  53. package/src/rerank/index.test.ts +0 -150
  54. package/src/rerank/index.ts +0 -198
  55. package/src/rerank/relevance/cohere/index.ts +0 -56
  56. package/src/rerank/relevance/index.ts +0 -3
  57. package/src/rerank/relevance/mastra-agent/index.ts +0 -32
  58. package/src/rerank/relevance/zeroentropy/index.ts +0 -26
  59. package/src/tools/README.md +0 -153
  60. package/src/tools/document-chunker.ts +0 -34
  61. package/src/tools/graph-rag.test.ts +0 -115
  62. package/src/tools/graph-rag.ts +0 -154
  63. package/src/tools/index.ts +0 -3
  64. package/src/tools/types.ts +0 -110
  65. package/src/tools/vector-query-database-config.test.ts +0 -190
  66. package/src/tools/vector-query.test.ts +0 -418
  67. package/src/tools/vector-query.ts +0 -169
  68. package/src/utils/convert-sources.ts +0 -43
  69. package/src/utils/default-settings.ts +0 -38
  70. package/src/utils/index.ts +0 -3
  71. package/src/utils/tool-schemas.ts +0 -38
  72. package/src/utils/vector-prompts.ts +0 -832
  73. package/src/utils/vector-search.ts +0 -117
  74. package/tsconfig.build.json +0 -9
  75. package/tsconfig.json +0 -5
  76. package/tsup.config.ts +0 -17
  77. package/vitest.config.ts +0 -8
package/src/document/document.test.ts
@@ -1,2975 +0,0 @@
1
- import { createOpenAI } from '@ai-sdk/openai';
2
- import { embedMany } from 'ai';
3
- import { describe, it, expect, vi } from 'vitest';
4
-
5
- import { MDocument } from './document';
6
- import { Language } from './types';
7
-
8
- const sampleMarkdown = `
9
- # Complete Guide to Modern Web Development
10
- ## Introduction
11
- Welcome to our comprehensive guide on modern web development. This resource covers essential concepts, best practices, and tools that every developer should know in 2024.
12
-
13
- ### Who This Guide Is For
14
- - Beginning developers looking to establish a solid foundation
15
- - Intermediate developers wanting to modernize their skillset
16
- - Senior developers seeking a refresher on current best practices
17
- `;
18
-
19
- const openai = createOpenAI({
20
- apiKey: process.env.OPENAI_API_KEY,
21
- });
22
-
23
- vi.setConfig({ testTimeout: 100_000, hookTimeout: 100_000 });
24
-
25
- describe('MDocument', () => {
26
- describe('basics', () => {
27
- let chunks: MDocument['chunks'];
28
- let doc: MDocument;
29
- it('initialization', () => {
30
- const doc = new MDocument({ docs: [{ text: 'test' }], type: 'text' });
31
- expect(doc.getDocs()).toHaveLength(1);
32
- expect(doc.getText()?.[0]).toBe('test');
33
- });
34
-
35
- it('initialization with array', () => {
36
- doc = new MDocument({ docs: [{ text: 'test' }, { text: 'test2' }], type: 'text' });
37
- expect(doc.getDocs()).toHaveLength(2);
38
- expect(doc.getDocs()[0]?.text).toBe('test');
39
- expect(doc.getDocs()[1]?.text).toBe('test2');
40
- });
41
-
42
- it('chunk - metadata title', async () => {
43
- const doc = MDocument.fromMarkdown(sampleMarkdown);
44
-
45
- chunks = await doc.chunk({
46
- maxSize: 1500,
47
- overlap: 0,
48
- extract: {
49
- keywords: true,
50
- },
51
- });
52
-
53
- expect(doc.getMetadata()?.[0]).toBeTruthy();
54
- expect(chunks).toBeInstanceOf(Array);
55
- }, 15000);
56
-
57
- it('embed - create embedding from chunk', async () => {
58
- const embeddings = await embedMany({
59
- values: chunks.map(chunk => chunk.text),
60
- model: openai.embedding('text-embedding-3-small'),
61
- });
62
-
63
- expect(embeddings).toBeDefined();
64
- }, 15000);
65
- });
66
-
67
- describe('chunkCharacter', () => {
68
- it('should split text on simple separator', async () => {
69
- const text = 'Hello world\n\nHow are you\n\nI am fine';
70
-
71
- const doc = MDocument.fromText(text, { meta: 'data' });
72
-
73
- await doc.chunk({
74
- strategy: 'character',
75
- separator: '\n\n',
76
- isSeparatorRegex: false,
77
- maxSize: 50,
78
- overlap: 5,
79
- });
80
-
81
- const chunks = doc.getDocs();
82
-
83
- expect(chunks).toHaveLength(3);
84
- expect(chunks?.[0]?.text).toBe('Hello world');
85
- expect(chunks?.[1]?.text).toBe('How are you');
86
- expect(chunks?.[2]?.text).toBe('I am fine');
87
- });
88
-
89
- it('should handle regex separator', async () => {
90
- const text = 'Hello world\n\nHow are you';
91
-
92
- const doc = MDocument.fromText(text, { meta: 'data' });
93
-
94
- await doc.chunk({
95
- strategy: 'character',
96
- separator: '\\s+',
97
- isSeparatorRegex: true,
98
- maxSize: 50,
99
- overlap: 5,
100
- });
101
-
102
- expect(doc.getText().join(' ')).toBe('Hello world How are you');
103
- });
104
-
105
- it('should keep separator when specified', async () => {
106
- const text = 'Hello\n\nWorld';
107
-
108
- const doc = MDocument.fromText(text, { meta: 'data' });
109
-
110
- await doc.chunk({
111
- strategy: 'character',
112
- separator: '\n\n',
113
- isSeparatorRegex: false,
114
- maxSize: 50,
115
- overlap: 5,
116
- keepSeparator: 'end',
117
- });
118
- const chunks = doc.getText();
119
-
120
- expect(chunks[0]).toBe('Hello\n\n');
121
- expect(chunks[1]).toBe('World');
122
- });
123
-
124
- describe('separator handling', () => {
125
- it('should keep separator at end when specified', async () => {
126
- const text = 'Hello\n\nWorld';
127
-
128
- const doc = MDocument.fromText(text, { meta: 'data' });
129
-
130
- await doc.chunk({
131
- strategy: 'character',
132
- separator: '\n\n',
133
- isSeparatorRegex: false,
134
- maxSize: 50,
135
- overlap: 5,
136
- keepSeparator: 'end',
137
- });
138
-
139
- const chunks = doc.getText();
140
-
141
- expect(chunks).toHaveLength(2);
142
- expect(chunks[0]).toBe('Hello\n\n');
143
- expect(chunks[1]).toBe('World');
144
- });
145
-
146
- it('should keep separator at start when specified', async () => {
147
- const text = 'Hello\n\nWorld\n\nTest';
148
-
149
- const doc = MDocument.fromText(text, { meta: 'data' });
150
-
151
- await doc.chunk({
152
- strategy: 'character',
153
- separator: '\n\n',
154
- isSeparatorRegex: false,
155
- maxSize: 50,
156
- overlap: 5,
157
- keepSeparator: 'start',
158
- });
159
-
160
- const chunks = doc.getText();
161
-
162
- expect(chunks).toHaveLength(3);
163
- expect(chunks[0]).toBe('Hello');
164
- expect(chunks[1]).toBe('\n\nWorld');
165
- expect(chunks[2]).toBe('\n\nTest');
166
- });
167
-
168
- it('should handle multiple consecutive separators', async () => {
169
- const text = 'Hello\n\n\n\nWorld';
170
-
171
- const doc = MDocument.fromText(text, { meta: 'data' });
172
-
173
- await doc.chunk({
174
- strategy: 'character',
175
- separator: '\n\n',
176
- isSeparatorRegex: false,
177
- maxSize: 50,
178
- overlap: 5,
179
- keepSeparator: 'end',
180
- });
181
-
182
- const chunks = doc.getText();
183
-
184
- expect(chunks.length).toBeGreaterThan(0);
185
- expect(chunks.join('')).toBe(text);
186
- });
187
-
188
- it('should handle text ending with separator', async () => {
189
- const text = 'Hello\n\nWorld\n\n';
190
-
191
- const doc = MDocument.fromText(text, { meta: 'data' });
192
-
193
- await doc.chunk({
194
- strategy: 'character',
195
- separator: '\n\n',
196
- isSeparatorRegex: false,
197
- maxSize: 50,
198
- overlap: 5,
199
- keepSeparator: 'end',
200
- });
201
-
202
- const chunks = doc.getText();
203
-
204
- expect(chunks.length).toBeGreaterThan(0);
205
- expect(chunks.join('')).toBe(text);
206
- });
207
-
208
- it('should handle text starting with separator', async () => {
209
- const text = '\n\nHello\n\nWorld';
210
-
211
- const doc = MDocument.fromText(text, { meta: 'data' });
212
-
213
- await doc.chunk({
214
- strategy: 'character',
215
- separator: '\n\n',
216
- isSeparatorRegex: false,
217
- maxSize: 50,
218
- overlap: 5,
219
- keepSeparator: 'start',
220
- });
221
-
222
- const chunks = doc.getText();
223
-
224
- expect(chunks.length).toBeGreaterThan(0);
225
- expect(chunks.join('')).toBe(text);
226
- });
227
- });
228
- it('should properly implement overlap in character chunking', async () => {
229
- // Test basic overlap functionality
230
- const text = 'a'.repeat(500) + 'b'.repeat(500) + 'c'.repeat(500);
231
- const chunkSize = 600;
232
- const overlap = 100;
233
- const doc = MDocument.fromText(text);
234
-
235
- const result = await doc.chunk({
236
- strategy: 'character',
237
- maxSize: chunkSize,
238
- overlap,
239
- });
240
-
241
- // Verify overlap between chunks
242
- for (let i = 1; i < result.length; i++) {
243
- const prevChunk = result[i - 1]?.text;
244
- const currentChunk = result[i]?.text;
245
-
246
- if (prevChunk && currentChunk) {
247
- // Get the end of the previous chunk and start of current chunk
248
- const prevEnd = prevChunk.slice(-overlap);
249
- const currentStart = currentChunk.slice(0, overlap);
250
-
251
- // There should be a common substring of length >= min(overlap, chunk length)
252
- const commonSubstring = findCommonSubstring(prevEnd, currentStart);
253
- expect(commonSubstring.length).toBeGreaterThan(0);
254
- }
255
- }
256
- });
257
-
258
- it('should ensure character chunks never exceed size limit', async () => {
259
- // Create text with varying content to test size limits
260
- const text = 'a'.repeat(50) + 'b'.repeat(100) + 'c'.repeat(30);
261
- const chunkSize = 50;
262
- const overlap = 10;
263
-
264
- const doc = MDocument.fromText(text);
265
- const chunks = await doc.chunk({
266
- strategy: 'character',
267
- maxSize: chunkSize,
268
- overlap,
269
- });
270
-
271
- chunks.forEach((chunk, i) => {
272
- if (i > 0) {
273
- const prevChunk = chunks[i - 1]?.text;
274
- const actualOverlap = chunk.text.slice(0, overlap);
275
- const expectedOverlap = prevChunk?.slice(-overlap);
276
- expect(actualOverlap).toBe(expectedOverlap);
277
- }
278
- });
279
-
280
- // Verify each chunk's size
281
- let allChunksValid = true;
282
- for (const chunk of chunks) {
283
- if (chunk.text.length > chunkSize) {
284
- allChunksValid = false;
285
- }
286
- }
287
- expect(allChunksValid).toBe(true);
288
-
289
- // Verify overlaps between consecutive chunks
290
- for (let i = 1; i < chunks.length; i++) {
291
- const prevChunk = chunks[i - 1]!;
292
- const currentChunk = chunks[i]!;
293
-
294
- // The end of the previous chunk should match the start of the current chunk
295
- const prevEnd = prevChunk.text.slice(-overlap);
296
- const currentStart = currentChunk.text.slice(0, overlap);
297
-
298
- expect(currentStart).toBe(prevEnd);
299
- expect(currentStart.length).toBeLessThanOrEqual(overlap);
300
- }
301
- });
302
-
303
- it('should handle end chunks properly in character chunking', async () => {
304
- const text = 'This is a test document that needs to be split into chunks with proper handling of the end.';
305
- const chunkSize = 20;
306
- const overlap = 5;
307
-
308
- const testDoc = MDocument.fromText(text);
309
- const chunks = await testDoc.chunk({
310
- strategy: 'character',
311
- maxSize: chunkSize,
312
- overlap,
313
- });
314
-
315
- // Verify no tiny fragments at the end
316
- const lastChunk = chunks[chunks.length - 1]?.text;
317
- expect(lastChunk?.length).toBeGreaterThan(5);
318
-
319
- // Verify each chunk respects size limit
320
- let allChunksValid = true;
321
- for (const chunk of chunks) {
322
- if (chunk.text.length > chunkSize) {
323
- allChunksValid = false;
324
- }
325
- }
326
- expect(allChunksValid).toBe(true);
327
-
328
- // Verify the size of each chunk explicitly
329
- for (const chunk of chunks) {
330
- expect(chunk.text.length).toBeLessThanOrEqual(chunkSize);
331
- }
332
-
333
- // Verify overlaps between consecutive chunks
334
- for (let i = 1; i < chunks.length; i++) {
335
- const prevChunk = chunks[i - 1]!;
336
- const currentChunk = chunks[i]!;
337
-
338
- // The end of the previous chunk should match the start of the current chunk
339
- const prevEnd = prevChunk.text.slice(-overlap);
340
- const currentStart = currentChunk.text.slice(0, overlap);
341
-
342
- expect(currentStart).toBe(prevEnd);
343
- expect(currentStart.length).toBeLessThanOrEqual(overlap);
344
- }
345
- });
346
- it('should not create tiny chunks at the end', async () => {
347
- const text = 'ABCDEFGHIJ'; // 10 characters
348
- const chunkSize = 4;
349
- const overlap = 2;
350
-
351
- const doc = MDocument.fromText(text);
352
- const chunks = await doc.chunk({
353
- strategy: 'character',
354
- maxSize: chunkSize,
355
- overlap,
356
- });
357
-
358
- // Verify we don't have tiny chunks
359
- chunks.forEach(chunk => {
360
- // Each chunk should be either:
361
- // 1. Full size (chunkSize)
362
- // 2. Or at least half the chunk maxSize if it's the last chunk
363
- const minSize = chunk === chunks[chunks.length - 1] ? Math.floor(chunkSize / 2) : chunkSize;
364
- expect(chunk.text.length).toBeGreaterThanOrEqual(minSize);
365
- });
366
-
367
- // Verify overlaps are maintained
368
- for (let i = 1; i < chunks.length; i++) {
369
- const prevChunk = chunks[i - 1]!;
370
- const currentChunk = chunks[i]!;
371
- const actualOverlap = currentChunk.text.slice(0, overlap);
372
- const expectedOverlap = prevChunk.text.slice(-overlap);
373
- expect(actualOverlap).toBe(expectedOverlap);
374
- }
375
- });
376
- });
377
-
378
- describe('text transformer overlap', () => {
379
- it('should properly implement overlap in text splitting', async () => {
380
- // Create a text with distinct sections that will be split
381
- const text = 'Section1'.repeat(100) + '\n\n' + 'Section2'.repeat(100) + '\n\n' + 'Section3'.repeat(100);
382
- const size = 300;
383
- const overlapSize = 50;
384
- const doc = MDocument.fromText(text, { meta: 'data' });
385
-
386
- await doc.chunk({
387
- strategy: 'recursive',
388
- maxSize: size,
389
- overlap: overlapSize,
390
- separators: ['\n\n'], // Split on double newlines
391
- });
392
-
393
- const docs = doc.getDocs();
394
- expect(docs.length).toBeGreaterThan(1); // Should create multiple chunks
395
-
396
- for (let i = 1; i < docs.length; i++) {
397
- const prevChunk = docs[i - 1]?.text;
398
- const currentChunk = docs[i]?.text;
399
-
400
- if (prevChunk && currentChunk) {
401
- // Check if there's some overlap between chunks
402
- // We should find some common text between the end of the previous chunk
403
- // and the beginning of the current chunk
404
- const commonText = findCommonSubstring(prevChunk, currentChunk);
405
- expect(commonText.length).toBeGreaterThan(0);
406
- }
407
- }
408
- });
409
- });
410
-
411
- describe('chunkRecursive', () => {
412
- it('chunkRecursive', async () => {
413
- const text =
414
- 'Hello world.\n\nThis is a test of the recursive splitting system.\nIt should handle multiple lines and different separators appropriately.';
415
-
416
- const doc = MDocument.fromText(text, { meta: 'data' });
417
-
418
- await doc.chunk({
419
- strategy: 'recursive',
420
- separators: ['\n\n', '\n', ' ', ''],
421
- isSeparatorRegex: false,
422
- maxSize: 50,
423
- overlap: 5,
424
- });
425
-
426
- expect(doc.getDocs()?.length).toBeGreaterThan(1);
427
-
428
- doc.getText()?.forEach(t => {
429
- expect(t.length).toBeLessThanOrEqual(50);
430
- });
431
- });
432
-
433
- it('chunkRecursive - language options', async () => {
434
- const tsCode = `
435
- interface User {
436
- name: string;
437
- age: number;
438
- }
439
-
440
- function greet(user: User) {
441
- console.log(\`Hello \${user.name}\`);
442
- }
443
- `;
444
-
445
- const doc = MDocument.fromText(tsCode, { meta: 'data' });
446
-
447
- await doc.chunk({
448
- maxSize: 50,
449
- overlap: 5,
450
- language: Language.TS,
451
- });
452
-
453
- expect(doc.getDocs().length).toBeGreaterThan(1);
454
- expect(doc.getText().some(chunk => chunk.includes('interface'))).toBe(true);
455
- expect(doc.getText().some(chunk => chunk.includes('function'))).toBe(true);
456
- });
457
-
458
- it('should throw error for unsupported language', async () => {
459
- const doc = MDocument.fromText('tsCode', { meta: 'data' });
460
-
461
- await expect(
462
- doc.chunk({
463
- maxSize: 50,
464
- overlap: 5,
465
- language: 'invalid-language' as any,
466
- }),
467
- ).rejects.toThrow();
468
- });
469
-
470
- it('should maintain context with overlap', async () => {
471
- // Create a longer text that will definitely be split into multiple chunks
472
- const text =
473
- 'This is a test paragraph. '.repeat(50) +
474
- '\n\n' +
475
- 'This is a second paragraph with different content. '.repeat(50) +
476
- '\n\n' +
477
- 'This is a third paragraph with more unique content. '.repeat(50);
478
- const doc = MDocument.fromText(text, { meta: 'data' });
479
- const overlapSize = 20; // Explicit overlap size
480
-
481
- await doc.chunk({
482
- strategy: 'recursive',
483
- maxSize: 500, // Smaller chunk maxSize to ensure multiple chunks
484
- overlap: overlapSize,
485
- });
486
-
487
- const docs = doc.getDocs();
488
-
489
- // Ensure we have multiple chunks to test overlap
490
- expect(docs.length).toBeGreaterThan(1);
491
-
492
- for (let i = 1; i < docs.length; i++) {
493
- const prevChunk = docs[i - 1]?.text;
494
- const currentChunk = docs[i]?.text;
495
-
496
- if (prevChunk && currentChunk) {
497
- // Test using two methods:
498
-
499
- // 1. Check for shared words (original test)
500
- const hasWordOverlap = prevChunk.split(' ').some(word => word.length > 1 && currentChunk.includes(word));
501
-
502
- // 2. Check for shared character sequences
503
- const commonText = findCommonSubstring(prevChunk, currentChunk);
504
-
505
- // At least one of these overlap detection methods should succeed
506
- expect(hasWordOverlap || commonText.length > 5).toBe(true);
507
- }
508
- }
509
- });
510
-
511
- it('should respect the specified overlap size', async () => {
512
- const text = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.repeat(10); // Long repeating text
513
- const chunkSize = 50;
514
- const overlapSize = 20;
515
- const doc = MDocument.fromText(text, { meta: 'data' });
516
-
517
- await doc.chunk({
518
- strategy: 'recursive',
519
- maxSize: chunkSize,
520
- overlap: overlapSize,
521
- });
522
-
523
- const docs = doc.getDocs();
524
- // Skip first chunk as it doesn't have a previous chunk to overlap with
525
- for (let i = 1; i < docs.length; i++) {
526
- const prevChunk = docs[i - 1]?.text;
527
- const currentChunk = docs[i]?.text;
528
-
529
- if (prevChunk && currentChunk) {
530
- // Get the end of the previous chunk
531
- const prevEnd = prevChunk.slice(-overlapSize);
532
- // Get the start of the current chunk
533
- const currentStart = currentChunk.slice(0, overlapSize);
534
-
535
- // There should be some overlap between the end of the previous chunk
536
- // and the start of the current chunk
537
- expect(prevEnd).toContain(currentStart.slice(0, 5));
538
- // The overlap shouldn't be the entire chunk
539
- expect(prevChunk).not.toBe(currentChunk);
540
- }
541
- }
542
- });
543
- });
544
-
545
- describe('chunkHTML', () => {
546
- it('should split HTML with headers correctly', async () => {
547
- const html = `
548
- <html>
549
- <body>
550
- <h1>Main Title</h1>
551
- <p>Main content.</p>
552
- <h2>Section 1</h2>
553
- <p>Section 1 content.</p>
554
- <h3>Subsection 1.1</h3>
555
- <p>Subsection content.</p>
556
- </body>
557
- </html>
558
- `;
559
-
560
- const doc = MDocument.fromHTML(html, { meta: 'data' });
561
-
562
- await doc.chunk({
563
- strategy: 'html',
564
- headers: [
565
- ['h1', 'Header 1'],
566
- ['h2', 'Header 2'],
567
- ['h3', 'Header 3'],
568
- ],
569
- });
570
-
571
- const docs = doc.getDocs();
572
- expect(docs.length).toBeGreaterThan(1);
573
- expect(docs?.[0]?.metadata?.['Header 1']).toBe('Main Title');
574
- expect(docs?.[1]?.metadata?.['Header 2']).toBe('Section 1');
575
- });
576
-
577
- it('should handle nested content', async () => {
578
- const html = `
579
- <html>
580
- <body>
581
- <h1>Title</h1>
582
- <div>
583
- <p>Nested content.</p>
584
- <div>
585
- <p>Deeply nested content.</p>
586
- </div>
587
- </div>
588
- </body>
589
- </html>
590
- `;
591
-
592
- const doc = MDocument.fromHTML(html, { meta: 'data' });
593
-
594
- await doc.chunk({
595
- strategy: 'html',
596
- headers: [
597
- ['h1', 'Header 1'],
598
- ['h2', 'Header 2'],
599
- ['h3', 'Header 3'],
600
- ],
601
- });
602
-
603
- const docs = doc.getDocs();
604
- const mainSection = docs.find(doc => doc.metadata?.['Header 1'] === 'Title');
605
- expect(mainSection?.text).toContain('Nested content');
606
- expect(mainSection?.text).toContain('Deeply nested content');
607
- });
608
-
609
- it('should respect returnEachElement option', async () => {
610
- const html = `
611
- <html>
612
- <body>
613
- <h1>Title</h1>
614
- <p>Paragraph 1</p>
615
- <h1>Title</h1>
616
- <p>Paragraph 2</p>
617
- <h1>Title</h1>
618
- <p>Paragraph 3</p>
619
- </body>
620
- </html>
621
- `;
622
-
623
- const doc = MDocument.fromHTML(html, { meta: 'data' });
624
-
625
- await doc.chunk({
626
- strategy: 'html',
627
-
628
- returnEachLine: true,
629
- headers: [
630
- ['h1', 'Header 1'],
631
- ['h2', 'Header 2'],
632
- ['h3', 'Header 3'],
633
- ],
634
- });
635
-
636
- const docs = doc.getDocs();
637
-
638
- expect(docs.length).toBeGreaterThan(2);
639
- docs.forEach(doc => {
640
- expect(doc.metadata?.['Header 1']).toBe('Title');
641
- });
642
- });
643
-
644
- it('should split HTML into sections', async () => {
645
- const html = `
646
- <html>
647
- <body>
648
- <h1>Document Title</h1>
649
- <p>Introduction text.</p>
650
- <h2>First Section</h2>
651
- <p>First section content.</p>
652
- <h2>Second Section</h2>
653
- <p>Second section content.</p>
654
- </body>
655
- </html>
656
- `;
657
-
658
- const doc = MDocument.fromHTML(html, { meta: 'data' });
659
-
660
- await doc.chunk({
661
- strategy: 'html',
662
- sections: [
663
- ['h1', 'Header 1'],
664
- ['h2', 'Header 2'],
665
- ],
666
- });
667
- const docs = doc.getDocs();
668
-
669
- expect(docs.length).toBe(3);
670
- expect(docs?.[0]?.metadata?.['Header 1']).toBe('Document Title');
671
- expect(docs?.[1]?.metadata?.['Header 2']).toBe('First Section');
672
- });
673
-
674
- it('should properly merge metadata', async () => {
675
- const doc = new MDocument({
676
- docs: [
677
- {
678
- text: `
679
- <h1>Title 1</h1>
680
- <p>Content 1</p>
681
- `,
682
- metadata: { source: 'doc1' },
683
- },
684
- {
685
- text: `
686
- <h1>Title 2</h1>
687
- <p>Content 2</p>
688
- `,
689
- metadata: { source: 'doc2' },
690
- },
691
- ],
692
- type: 'html',
693
- });
694
-
695
- await doc.chunk({
696
- strategy: 'html',
697
- sections: [
698
- ['h1', 'Header 1'],
699
- ['h2', 'Header 2'],
700
- ],
701
- });
702
-
703
- doc.getDocs().forEach(doc => {
704
- expect(doc?.metadata).toHaveProperty('source');
705
- expect(doc?.metadata).toHaveProperty('Header 1');
706
- });
707
- });
708
-
709
- it('should handle empty or invalid HTML', async () => {
710
- const emptyHtml = '';
711
- const invalidHtml = '<unclosed>test';
712
- const noHeadersHtml = '<div>test</div>';
713
-
714
- const doc1 = MDocument.fromHTML(emptyHtml, { meta: 'data' });
715
- const doc2 = MDocument.fromHTML(invalidHtml, { meta: 'data' });
716
- const doc3 = MDocument.fromHTML(noHeadersHtml, { meta: 'data' });
717
-
718
- await doc1.chunk({
719
- strategy: 'html',
720
- headers: [
721
- ['h1', 'Header 1'],
722
- ['h2', 'Header 2'],
723
- ],
724
- });
725
-
726
- await doc2.chunk({
727
- strategy: 'html',
728
- headers: [
729
- ['h1', 'Header 1'],
730
- ['h2', 'Header 2'],
731
- ],
732
- });
733
-
734
- await doc3.chunk({
735
- strategy: 'html',
736
- headers: [
737
- ['h1', 'Header 1'],
738
- ['h2', 'Header 2'],
739
- ],
740
- });
741
-
742
- expect(doc1.getDocs()).toHaveLength(0);
743
- expect(doc2.getDocs()).toHaveLength(0);
744
- expect(doc3.getDocs()).toHaveLength(0);
745
- });
746
-
747
- it('should handle complex nested header hierarchies', async () => {
748
- const html = `
749
- <html>
750
- <body>
751
- <h1>Main Title</h1>
752
- <p>Main content</p>
753
- <h2>Section 1</h2>
754
- <p>Section 1 content</p>
755
- <h3>Subsection 1.1</h3>
756
- <p>Subsection 1.1 content</p>
757
- <h2>Section 2</h2>
758
- <h3>Subsection 2.1</h3>
759
- <p>Subsection 2.1 content</p>
760
- </body>
761
- </html>
762
- `;
763
-
764
- const doc = MDocument.fromHTML(html, { meta: 'data' });
765
- await doc.chunk({
766
- strategy: 'html',
767
- headers: [
768
- ['h1', 'Header 1'],
769
- ['h2', 'Header 2'],
770
- ['h3', 'Header 3'],
771
- ],
772
- });
773
-
774
- const docs = doc.getDocs();
775
- expect(docs.length).toBeGreaterThan(3);
776
- expect(docs.some(d => d.metadata?.['Header 1'] === 'Main Title')).toBe(true);
777
- expect(docs.some(d => d.metadata?.['Header 2'] === 'Section 1')).toBe(true);
778
- expect(docs.some(d => d.metadata?.['Header 3'] === 'Subsection 1.1')).toBe(true);
779
- });
780
-
781
- it('should handle headers with mixed content and special characters', async () => {
782
- const html = `
783
- <html>
784
- <body>
785
- <h1>Title with <strong>bold</strong> &amp; <em>emphasis</em></h1>
786
- <p>Content 1</p>
787
- <h2>Section with &lt;tags&gt; &amp; symbols</h2>
788
- <p>Content 2</p>
789
- </body>
790
- </html>
791
- `;
792
-
793
- const doc = MDocument.fromHTML(html, { meta: 'data' });
794
- await doc.chunk({
795
- strategy: 'html',
796
- headers: [
797
- ['h1', 'Header 1'],
798
- ['h2', 'Header 2'],
799
- ],
800
- });
801
-
802
- const docs = doc.getDocs();
803
- expect(docs.length).toBeGreaterThan(1);
804
- expect(docs[0]?.metadata?.['Header 1']).toContain('bold');
805
- expect(docs[0]?.metadata?.['Header 1']).toContain('&');
806
- expect(docs[0]?.metadata?.['Header 1']).toContain('emphasis');
807
- expect(docs[1]?.metadata?.['Header 2']).toContain('<tags>');
808
- });
809
-
810
- it('should handle headers with no content or whitespace content', async () => {
811
- const html = `
812
- <html>
813
- <body>
814
- <h1>Empty Section</h1>
815
- <h2>Whitespace Section</h2>
816
-
817
- <h2>Valid Section</h2>
818
- <p>Content</p>
819
- </body>
820
- </html>
821
- `;
822
-
823
- const doc = MDocument.fromHTML(html, { meta: 'data' });
824
- await doc.chunk({
825
- strategy: 'html',
826
- headers: [
827
- ['h1', 'Header 1'],
828
- ['h2', 'Header 2'],
829
- ],
830
- });
831
-
832
- const docs = doc.getDocs();
833
- expect(docs.some(d => d.metadata?.['Header 1'] === 'Empty Section')).toBe(true);
834
- expect(docs.some(d => d.metadata?.['Header 2'] === 'Valid Section')).toBe(true);
835
- expect(docs.find(d => d.metadata?.['Header 2'] === 'Valid Section')?.text).toContain('Content');
836
- });
837
-
838
- it('should generate correct XPaths for deeply nested elements', async () => {
839
- const html = `
840
- <html>
841
- <body>
842
- <div class="container">
843
- <section id="main">
844
- <div>
845
- <h1>Deeply Nested Title</h1>
846
- <p>Content</p>
847
- </div>
848
- <div>
849
- <h1>Second Title</h1>
850
- <p>More Content</p>
851
- </div>
852
- </section>
853
- </div>
854
- </body>
855
- </html>
856
- `;
857
-
858
- const doc = MDocument.fromHTML(html, { meta: 'data' });
859
- await doc.chunk({
860
- strategy: 'html',
861
- headers: [['h1', 'Header 1']],
862
- });
863
-
864
- const docs = doc.getDocs();
865
- expect(docs).toHaveLength(2);
866
-
867
- // First h1
868
- expect(docs[0]?.metadata?.['Header 1']).toBe('Deeply Nested Title');
869
- const xpath1 = docs[0]?.metadata?.xpath as string;
870
- expect(xpath1).toBeDefined();
871
- expect(xpath1).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[1\]\/h1\[1\]$/);
872
-
873
- // Second h1
874
- expect(docs[1]?.metadata?.['Header 1']).toBe('Second Title');
875
- const xpath2 = docs[1]?.metadata?.xpath as string;
876
- expect(xpath2).toBeDefined();
877
- expect(xpath2).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[2\]\/h1\[1\]$/);
878
- });
879
- });
880
-
881
- describe('chunkJson', () => {
882
- describe('Unicode handling', () => {
883
- it('should handle Unicode characters correctly', async () => {
884
- const input = {
885
- key1: '你好',
886
- key2: '世界',
887
- };
888
-
889
- const doc = MDocument.fromJSON(JSON.stringify(input), { meta: 'data' });
890
-
891
- await doc.chunk({
892
- strategy: 'json',
893
- maxSize: 50,
894
- minSize: 50,
895
- ensureAscii: true,
896
- });
897
-
898
- expect(doc.getText().some(chunk => chunk.includes('\\u'))).toBe(true);
899
-
900
- const combined = doc
901
- .getText()
902
- .map(chunk => {
903
- const c = JSON.parse(chunk);
904
- const retVal: Record<string, string> = {};
905
- Object.entries(c).forEach(([key, value]) => {
906
- retVal[key] = JSON.parse(`"${value as string}"`);
907
- });
908
-
909
- return retVal;
910
- })
911
- .reduce((acc, curr) => ({ ...acc, ...curr }), {});
912
-
913
- expect(combined?.key1?.charCodeAt(0)).toBe('你'.charCodeAt(0));
914
- expect(combined?.key1?.charCodeAt(1)).toBe('好'.charCodeAt(0));
915
- expect(combined?.key2?.charCodeAt(0)).toBe('世'.charCodeAt(0));
916
- expect(combined?.key2?.charCodeAt(1)).toBe('界'.charCodeAt(0));
917
-
918
- expect(combined?.key1).toBe('你好');
919
- expect(combined?.key2).toBe('世界');
920
- });
921
-
922
- it('should handle non-ASCII without escaping when ensureAscii is false', async () => {
923
- const input = {
924
- key1: '你好',
925
- key2: '世界',
926
- };
927
-
928
- const doc = MDocument.fromJSON(JSON.stringify(input), { meta: 'data' });
929
-
930
- await doc.chunk({
931
- strategy: 'json',
932
- maxSize: 50,
933
- ensureAscii: false,
934
- });
935
-
936
- expect(doc.getText().some(chunk => chunk.includes('你好'))).toBe(true);
937
-
938
- const combined = doc
939
- .getText()
940
- .map(chunk => JSON.parse(chunk))
941
- .reduce((acc, curr) => ({ ...acc, ...curr }), {});
942
-
943
- expect(combined.key1).toBe('你好');
944
- expect(combined.key2).toBe('世界');
945
- });
946
- });
947
-
948
- describe('JSON structure handling', () => {
949
- it('should handle flat objects', async () => {
950
- const flatJson = {
951
- name: 'John',
952
- age: 30,
953
- email: 'john@example.com',
954
- };
955
-
956
- const doc = MDocument.fromJSON(JSON.stringify(flatJson), { meta: 'data' });
957
- await doc.chunk({
958
- strategy: 'json',
959
- maxSize: 50,
960
- minSize: 10,
961
- });
962
-
963
- const chunks = doc.getText();
964
- expect(chunks.length).toBeGreaterThan(0);
965
-
966
- // Verify all data is preserved
967
- const reconstructed = chunks.map(chunk => JSON.parse(chunk)).reduce((acc, curr) => ({ ...acc, ...curr }), {});
968
- expect(reconstructed).toEqual(flatJson);
969
- });
970
-
971
- it('should handle nested objects', async () => {
972
- const nestedJson = {
973
- user: {
974
- name: 'John',
975
- contact: {
976
- email: 'john@example.com',
977
- phone: '123-456-7890',
978
- },
979
- },
980
- };
981
-
982
- const doc = MDocument.fromJSON(JSON.stringify(nestedJson), { meta: 'data' });
983
- await doc.chunk({
984
- strategy: 'json',
985
- maxSize: 50,
986
- minSize: 10,
987
- });
988
-
989
- const chunks = doc.getText();
990
- expect(chunks.length).toBeGreaterThan(0);
991
-
992
- // Verify nested structure is maintained
993
- chunks.forEach(chunk => {
994
- const parsed = JSON.parse(chunk);
995
- expect(parsed).toHaveProperty('user');
996
- });
997
- });
998
-
999
- it('should handle arrays of objects', async () => {
1000
- const arrayJson = [
1001
- { id: 1, value: 'first' },
1002
- { id: 2, value: 'second' },
1003
- ];
1004
-
1005
- const doc = MDocument.fromJSON(JSON.stringify(arrayJson), { meta: 'data' });
1006
- await doc.chunk({
1007
- strategy: 'json',
1008
- maxSize: 50,
1009
- minSize: 10,
1010
- });
1011
-
1012
- const chunks = doc.getText();
1013
- expect(chunks.length).toBe(2);
1014
- chunks.forEach((chunk, index) => {
1015
- const parsed = JSON.parse(chunk);
1016
- expect(parsed[index]).toEqual(arrayJson[index]);
1017
- });
1018
- });
1019
-
1020
- it('should handle mixed types', async () => {
1021
- const mixedJson = {
1022
- string: 'hello',
1023
- number: 123,
1024
- boolean: true,
1025
- array: [1, 2, 3],
1026
- object: {
1027
- nested: 'value',
1028
- },
1029
- };
1030
-
1031
- const doc = MDocument.fromJSON(JSON.stringify(mixedJson), { meta: 'data' });
1032
- await doc.chunk({
1033
- strategy: 'json',
1034
- maxSize: 50,
1035
- minSize: 10,
1036
- });
1037
-
1038
- const chunks = doc.getText();
1039
- const reconstructed = chunks.map(chunk => JSON.parse(chunk)).reduce((acc, curr) => ({ ...acc, ...curr }), {});
1040
-
1041
- expect(reconstructed).toEqual(mixedJson);
1042
- });
1043
-
1044
- it('should properly split long string values', async () => {
1045
- const longStringJson = {
1046
- title: 'Short title',
1047
- description:
1048
- 'This is a very long description that should definitely exceed our maxSize limit of 128 characters. It contains multiple sentences and should be split into multiple chunks while maintaining proper structure.',
1049
- };
1050
-
1051
- const doc = MDocument.fromJSON(JSON.stringify(longStringJson), { meta: 'data' });
1052
- await doc.chunk({
1053
- strategy: 'json',
1054
- maxSize: 50,
1055
- minSize: 10,
1056
- });
1057
-
1058
- const chunks = doc.getText();
1059
-
1060
- // Verify the short field is kept intact
1061
- expect(
1062
- chunks.some(chunk => {
1063
- const parsed = JSON.parse(chunk);
1064
- return parsed.title === 'Short title';
1065
- }),
1066
- ).toBe(true);
1067
-
1068
- // Verify the long field is split
1069
- const descriptionChunks = chunks
1070
- .map(chunk => JSON.parse(chunk))
1071
- .filter(parsed => parsed.description)
1072
- .map(parsed => parsed.description);
1073
-
1074
- expect(descriptionChunks.length).toBeGreaterThan(1);
1075
- expect(descriptionChunks.join('')).toBe(longStringJson.description);
1076
- });
1077
-
1078
- it('should respect maxSize in all chunks', async () => {
1079
- const doc = MDocument.fromJSON(
1080
- JSON.stringify({
1081
- key: 'x'.repeat(200), // Deliberately exceed maxSize
1082
- }),
1083
- { meta: 'data' },
1084
- );
1085
-
1086
- await doc.chunk({
1087
- strategy: 'json',
1088
- maxSize: 50,
1089
- minSize: 10,
1090
- });
1091
-
1092
- const chunks = doc.getText();
1093
- chunks.forEach(chunk => {
1094
- expect(chunk.length).toBeLessThanOrEqual(50);
1095
- });
1096
- });
1097
-
1098
- it('should properly group array items when possible', async () => {
1099
- const arrayData = [
1100
- { id: 1, name: 'Item 1', description: 'Short desc' },
1101
- { id: 2, name: 'Item 2', description: 'Short desc' },
1102
- {
1103
- id: 3,
1104
- name: 'Item 3',
1105
- description: 'This is a much longer description that should cause this item to be in its own chunk',
1106
- },
1107
- { id: 4, name: 'Item 4', description: 'Short desc' },
1108
- ];
1109
-
1110
- const doc = MDocument.fromJSON(JSON.stringify({ items: arrayData }));
1111
- await doc.chunk({
1112
- strategy: 'json',
1113
- maxSize: 100,
1114
- minSize: 10,
1115
- });
1116
-
1117
- const chunks = doc.getText().map(chunk => JSON.parse(chunk));
1118
-
1119
- // Change expectation: No items should be grouped when maxSize is too small
1120
- expect(chunks.every(chunk => !chunk.items || !Array.isArray(chunk.items) || chunk.items.length === 1)).toBe(
1121
- true,
1122
- );
1123
- });
1124
-
1125
- it('should group items with larger maxSize', async () => {
1126
- const arrayData = [
1127
- { id: 1, name: 'Item 1', description: 'Short desc' },
1128
- { id: 2, name: 'Item 2', description: 'Short desc' },
1129
- {
1130
- id: 3,
1131
- name: 'Item 3',
1132
- description: 'This is a much longer description that should cause this item to be in its own chunk',
1133
- },
1134
- { id: 4, name: 'Item 4', description: 'Short desc' },
1135
- ];
1136
-
1137
- const doc = MDocument.fromJSON(JSON.stringify({ items: arrayData }));
1138
- await doc.chunk({
1139
- strategy: 'json',
1140
- maxSize: 150, // Larger maxSize to allow grouping
1141
- minSize: 10,
1142
- });
1143
-
1144
- const chunks = doc.getText().map(chunk => JSON.parse(chunk));
1145
-
1146
- // Should group first two items
1147
- expect(
1148
- chunks.some(
1149
- chunk =>
1150
- chunk.items &&
1151
- Array.isArray(chunk.items) &&
1152
- chunk.items.length === 2 &&
1153
- chunk.items[0].id === 1 &&
1154
- chunk.items[1].id === 2,
1155
- ),
1156
- ).toBe(true);
1157
-
1158
- // Long item should still be separate
1159
- expect(
1160
- chunks.some(
1161
- chunk => chunk.items && Array.isArray(chunk.items) && chunk.items.length === 1 && chunk.items[0].id === 3,
1162
- ),
1163
- ).toBe(true);
1164
- });
1165
-
1166
- it('should group smaller items within maxSize limit', async () => {
1167
- const arrayData = [
1168
- { id: 1, name: 'A', desc: 'x' }, // Minimal items
1169
- { id: 2, name: 'B', desc: 'y' },
1170
- { id: 3, name: 'C', desc: 'This is the long one' },
1171
- { id: 4, name: 'D', desc: 'z' },
1172
- { id: 5, name: 'E', desc: 'w' }, // Added fifth item
1173
- ];
1174
-
1175
- const doc = MDocument.fromJSON(JSON.stringify({ items: arrayData }));
1176
- await doc.chunk({
1177
- strategy: 'json',
1178
- maxSize: 100,
1179
- minSize: 10,
1180
- });
1181
-
1182
- const chunks = doc.getText().map(chunk => JSON.parse(chunk));
1183
-
1184
- // Change expectation: Should group 2 items (not 3)
1185
- expect(
1186
- chunks.some(
1187
- chunk => chunk.items && Array.isArray(chunk.items) && chunk.items.length === 2, // Changed from >= 3
1188
- ),
1189
- ).toBe(true);
1190
- });
1191
-
1192
- it('should handle convertLists option', async () => {
1193
- const data = {
1194
- items: [1, 2, 3],
1195
- nested: {
1196
- list: ['a', 'b', 'c'],
1197
- },
1198
- };
1199
-
1200
- const doc = MDocument.fromJSON(JSON.stringify(data));
1201
- await doc.chunk({
1202
- strategy: 'json',
1203
- maxSize: 50,
1204
- minSize: 10,
1205
- convertLists: true,
1206
- });
1207
-
1208
- const chunks = doc.getText().map(chunk => JSON.parse(chunk));
1209
-
1210
- // Check that arrays were converted to objects with numeric keys
1211
- expect(
1212
- chunks.some(chunk => chunk.items && typeof chunk.items === 'object' && !Array.isArray(chunk.items)),
1213
- ).toBe(true);
1214
- });
1215
-
1216
- it('should handle ensureAscii option', async () => {
1217
- const data = {
1218
- text: 'Hello café world 🌍',
1219
- };
1220
-
1221
- const doc = MDocument.fromJSON(JSON.stringify(data));
1222
-
1223
- // With ensureAscii true
1224
- await doc.chunk({
1225
- strategy: 'json',
1226
- maxSize: 50,
1227
- minSize: 10,
1228
- ensureAscii: true,
1229
- });
1230
-
1231
- const asciiChunks = doc.getText();
1232
- expect(asciiChunks[0]).not.toMatch(/[^\x00-\x7F]/);
1233
-
1234
- // With ensureAscii false
1235
- await doc.chunk({
1236
- strategy: 'json',
1237
- maxSize: 50,
1238
- minSize: 10,
1239
- ensureAscii: false,
1240
- });
1241
-
1242
- const unicodeChunks = doc.getText();
1243
- expect(JSON.parse(unicodeChunks[0]).text).toMatch(/[^\x00-\x7F]/);
1244
- });
1245
-
1246
- it('should handle deeply nested structures', async () => {
1247
- const deepData = {
1248
- level1: {
1249
- level2: {
1250
- level3: {
1251
- level4: {
1252
- value: 'deep',
1253
- },
1254
- },
1255
- },
1256
- },
1257
- };
1258
-
1259
- const doc = MDocument.fromJSON(JSON.stringify(deepData));
1260
- await doc.chunk({
1261
- strategy: 'json',
1262
- maxSize: 50,
1263
- minSize: 10,
1264
- });
1265
-
1266
- const chunks = doc.getText().map(chunk => JSON.parse(chunk));
1267
- // Verify we can still access deeply nested value
1268
- chunks.forEach(chunk => {
1269
- expect(chunk).toHaveProperty('level1');
1270
- });
1271
- const hasDeepValue = chunks.some(chunk => {
1272
- try {
1273
- return chunk.level1?.level2?.level3?.level4?.value === 'deep';
1274
- } catch {
1275
- return false;
1276
- }
1277
- });
1278
- expect(hasDeepValue).toBe(true);
1279
- });
1280
-
1281
- it('should handle complex deeply nested structures with mixed types', async () => {
1282
- const complexData = {
1283
- organization: {
1284
- name: 'TechCorp',
1285
- departments: {
1286
- engineering: {
1287
- teams: [
1288
- {
1289
- name: 'Frontend',
1290
- projects: {
1291
- main: {
1292
- title: 'Website Redesign',
1293
- status: 'active',
1294
- tasks: [
1295
- { id: 1, description: 'Update homepage', status: 'done' },
1296
- { id: 2, description: 'Refactor CSS', status: 'in-progress' },
1297
- ],
1298
- metrics: {
1299
- performance: {
1300
- loadTime: '1.2s',
1301
- score: 95,
1302
- details: {
1303
- mobile: { score: 90, issues: ['image optimization'] },
1304
- desktop: { score: 98, issues: [] },
1305
- },
1306
- },
1307
- },
1308
- },
1309
- },
1310
- members: [
1311
- { id: 1, name: 'Alice', role: 'Lead' },
1312
- { id: 2, name: 'Bob', role: 'Senior Dev' },
1313
- ],
1314
- },
1315
- ],
1316
- },
1317
- },
1318
- },
1319
- };
1320
-
1321
- const doc = MDocument.fromJSON(JSON.stringify(complexData));
1322
- await doc.chunk({
1323
- strategy: 'json',
1324
- maxSize: 500, // Increased to more realistic size for JSON structures
1325
- minSize: 50, // Increased to account for JSON path overhead
1326
- });
1327
-
1328
- const chunks = doc.getText().map(chunk => JSON.parse(chunk));
1329
-
1330
- // Test complete objects are kept together when possible
1331
- expect(
1332
- chunks.some(chunk => {
1333
- const members = chunk.organization?.departments?.engineering?.teams?.[0]?.members;
1334
- return Array.isArray(members) && members.length === 2; // Both members should be in same chunk
1335
- }),
1336
- ).toBe(true);
1337
-
1338
- // Test large nested objects are split appropriately
1339
- expect(
1340
- chunks.some(
1341
- chunk =>
1342
- chunk.organization?.departments?.engineering?.teams?.[0]?.projects?.main?.metrics?.performance
1343
- ?.loadTime === '1.2s',
1344
- ),
1345
- ).toBe(true);
1346
-
1347
- // Test array items are handled properly
1348
- const taskChunks = chunks.filter(chunk => {
1349
- const tasks = chunk.organization?.departments?.engineering?.teams?.[0]?.projects?.main?.tasks;
1350
- return Array.isArray(tasks) || (tasks && typeof tasks === 'object');
1351
- });
1352
- expect(taskChunks.length).toBeGreaterThan(0);
1353
-
1354
- // Test that related data stays together when under maxSize
1355
- expect(
1356
- chunks.some(chunk => {
1357
- const mobile =
1358
- chunk.organization?.departments?.engineering?.teams?.[0]?.projects?.main?.metrics?.performance?.details
1359
- ?.mobile;
1360
- return mobile && mobile.score === 90 && Array.isArray(mobile.issues);
1361
- }),
1362
- ).toBe(true);
1363
- });
1364
- });
1365
- });
1366
-
1367
- describe('chunkToken', () => {
1368
- it('should handle different encodings', async () => {
1369
- const text = 'This is a test text for different encodings.';
1370
- const doc = MDocument.fromText(text, { meta: 'data' });
1371
-
1372
- await doc.chunk({
1373
- strategy: 'token',
1374
- encodingName: 'cl100k_base',
1375
- maxSize: 10,
1376
- overlap: 2,
1377
- });
1378
-
1379
- const chunks = doc.getText();
1380
-
1381
- expect(chunks.length).toBeGreaterThan(0);
1382
- expect(chunks.join(' ').trim()).toBe(text);
1383
- });
1384
-
1385
- it('should handle special tokens correctly', async () => {
1386
- const text = 'Test text <|endoftext|> more text';
1387
-
1388
- const doc = MDocument.fromText(text, { meta: 'data' });
1389
-
1390
- await doc.chunk({
1391
- strategy: 'token',
1392
- encodingName: 'gpt2',
1393
- maxSize: 10,
1394
- disallowedSpecial: new Set(),
1395
- allowedSpecial: new Set(['<|endoftext|>']),
1396
- overlap: 2,
1397
- });
1398
-
1399
- const chunks = doc.getText();
1400
-
1401
- expect(chunks.join(' ').includes('<|endoftext|>')).toBe(true);
1402
- });
1403
-
1404
- it('should strip whitespace when configured', async () => {
1405
- const text = ' This has whitespace ';
1406
-
1407
- const doc = MDocument.fromText(text, { meta: 'data' });
1408
-
1409
- await doc.chunk({
1410
- strategy: 'token',
1411
- encodingName: 'gpt2',
1412
- maxSize: 10,
1413
- disallowedSpecial: new Set(),
1414
- allowedSpecial: new Set(['<|endoftext|>']),
1415
- overlap: 2,
1416
- });
1417
-
1418
- const chunks = doc.getText();
1419
-
1420
- chunks.forEach(chunk => {
1421
- expect(chunk).not.toMatch(/^\s+|\s+$/);
1422
- });
1423
- });
1424
-
1425
- describe('Error cases', () => {
1426
- it('should throw error for invalid chunk maxSize and overlap', async () => {
1427
- const text = ' This has whitespace ';
1428
- const doc = MDocument.fromText(text, { meta: 'data' });
1429
-
1430
- await expect(
1431
- doc.chunk({
1432
- strategy: 'token',
1433
- maxSize: 100,
1434
- overlap: 150, // overlap larger than chunk maxSize
1435
- }),
1436
- ).rejects.toThrow();
1437
- });
1438
-
1439
- it('should handle invalid encoding name', async () => {
1440
- const text = ' This has whitespace ';
1441
- const doc = MDocument.fromText(text, { meta: 'data' });
1442
-
1443
- await expect(
1444
- doc.chunk({
1445
- strategy: 'token',
1446
- encodingName: 'invalid-encoding' as any,
1447
- maxSize: 100,
1448
- overlap: 150, // overlap larger than chunk maxSize
1449
- }),
1450
- ).rejects.toThrow();
1451
- });
1452
- });
1453
- });
1454
-
1455
- describe('chunkMarkdown', () => {
1456
- it('should split markdown text correctly', async () => {
1457
- const text = `# Header 1
1458
-
1459
- This is some text under header 1.
1460
-
1461
- ## Header 2
1462
-
1463
- This is some text under header 2.
1464
-
1465
- ### Header 3
1466
-
1467
- - List item 1
1468
- - List item 2`;
1469
-
1470
- const doc = MDocument.fromMarkdown(text, { meta: 'data' });
1471
-
1472
- await doc.chunk({
1473
- strategy: 'markdown',
1474
- maxSize: 100,
1475
- overlap: 10,
1476
- });
1477
-
1478
- const chunks = doc.getText();
1479
- expect(chunks.length).toBeGreaterThan(1);
1480
- expect(chunks[0]).toContain('# Header 1');
1481
- });
1482
-
1483
- it('should handle code blocks', async () => {
1484
- const text = `# Code Example
1485
-
1486
- \`\`\`javascript
1487
- function hello() {
1488
- console.log('Hello, World!');
1489
- }
1490
- \`\`\`
1491
-
1492
- Regular text after code block.`;
1493
-
1494
- const doc = MDocument.fromMarkdown(text, { meta: 'data' });
1495
-
1496
- await doc.chunk({
1497
- strategy: 'markdown',
1498
- maxSize: 100,
1499
- overlap: 10,
1500
- });
1501
-
1502
- const chunks = doc.getText();
1503
- expect(chunks.some(chunk => chunk.includes('```javascript'))).toBe(true);
1504
- });
1505
- });
1506
-
1507
- describe('chunkLaTeX', () => {
1508
- it('should split LaTeX text correctly based on sections', async () => {
1509
- const text = `\\section{Introduction}
1510
-
1511
- This is the introduction section.
1512
-
1513
- \\subsection{Background}
1514
-
1515
- Some background information.
1516
-
1517
- \\subsubsection{Details}
1518
-
1519
- Even more detailed explanation.
1520
-
1521
- \\section{Conclusion}
1522
-
1523
- Final thoughts here.`;
1524
-
1525
- const doc = MDocument.fromText(text, { meta: 'data' });
1526
-
1527
- await doc.chunk({
1528
- strategy: 'latex',
1529
- maxSize: 100,
1530
- overlap: 10,
1531
- keepSeparator: 'start',
1532
- });
1533
-
1534
- const chunks = doc.getText();
1535
- expect(chunks.length).toBeGreaterThan(1);
1536
- expect(chunks[0]).toContain('\\section{Introduction}');
1537
- });
1538
-
1539
- it('should handle environments like equations or itemize', async () => {
1540
- const text = `\\section{Math Section}
1541
-
1542
- Here is an equation:
1543
-
1544
- \\[
1545
- E = mc^2
1546
- \\]
1547
-
1548
- \\begin{itemize}
1549
- \\item First item
1550
- \\item Second item
1551
- \\end{itemize}
1552
-
1553
- End of the section.`;
1554
-
1555
- const doc = MDocument.fromText(text, { meta: 'data' });
1556
-
1557
- await doc.chunk({
1558
- strategy: 'latex',
1559
- maxSize: 100,
1560
- overlap: 10,
1561
- keepSeparator: 'start',
1562
- });
1563
-
1564
- const chunks = doc.getText();
1565
- expect(chunks.some(chunk => chunk.includes('\\begin{itemize}'))).toBe(true);
1566
- expect(chunks.some(chunk => chunk.includes('E = mc^2'))).toBe(true);
1567
- });
1568
-
1569
- it('should split with keepSeparator at end', async () => {
1570
- const text = `Intro text here.
1571
- \\section{First}
1572
- Content A.
1573
-
1574
- \\section{Second}
1575
- Content B.`;
1576
-
1577
- const doc = MDocument.fromText(text, { meta: 'data' });
1578
-
1579
- await doc.chunk({
1580
- strategy: 'latex',
1581
- maxSize: 50,
1582
- overlap: 0,
1583
- keepSeparator: 'end',
1584
- });
1585
-
1586
- const chunks = doc.getText();
1587
- expect(chunks.length).toBe(3);
1588
- expect(chunks[0].trimEnd().includes('\\section{')).toBe(true);
1589
- expect(chunks[1].trimEnd().includes('\\section{')).toBe(true);
1590
- });
1591
-
1592
- it('should strip whitespace correctly', async () => {
1593
- const text = `\\section{Whitespace}
1594
-
1595
- Content with leading and trailing whitespace.
1596
- `;
1597
-
1598
- const doc = MDocument.fromText(text, { meta: 'data' });
1599
-
1600
- await doc.chunk({
1601
- strategy: 'latex',
1602
- maxSize: 100,
1603
- overlap: 0,
1604
- stripWhitespace: true,
1605
- });
1606
-
1607
- const chunks = doc.getText();
1608
- expect(chunks.every(chunk => chunk === chunk.trim())).toBe(true);
1609
- });
1610
- });
1611
-
1612
- describe('MarkdownHeader', () => {
1613
- it('should split on headers and preserve metadata', async () => {
1614
- const text = `# Main Title
1615
-
1616
- Some content here.
1617
-
1618
- ## Section 1
1619
-
1620
- Section 1 content.
1621
-
1622
- ### Subsection 1.1
1623
-
1624
- Subsection content.
1625
-
1626
- ## Section 2
1627
-
1628
- Final content.`;
1629
-
1630
- const doc = MDocument.fromMarkdown(text);
1631
-
1632
- await doc.chunk({
1633
- strategy: 'markdown',
1634
- headers: [
1635
- ['#', 'Header 1'],
1636
- ['##', 'Header 2'],
1637
- ['###', 'Header 3'],
1638
- ],
1639
- });
1640
-
1641
- const docs = doc.getDocs();
1642
-
1643
- expect(docs.length).toBeGreaterThan(1);
1644
- expect(docs?.[0]?.metadata?.['Header 1']).toBe('Main Title');
1645
-
1646
- const section1 = docs.find(doc => doc?.metadata?.['Header 2'] === 'Section 1');
1647
- expect(section1).toBeDefined();
1648
- expect(section1?.text).toContain('Section 1 content');
1649
- });
1650
-
1651
- it('should handle nested headers correctly', async () => {
1652
- const text = `# Top Level
1653
-
1654
- ## Section A
1655
- Content A
1656
-
1657
- ### Subsection A1
1658
- Content A1
1659
-
1660
- ## Section B
1661
- Content B`;
1662
-
1663
- const doc = MDocument.fromMarkdown(text, { meta: 'data' });
1664
-
1665
- await doc.chunk({
1666
- strategy: 'markdown',
1667
- headers: [
1668
- ['#', 'Header 1'],
1669
- ['##', 'Header 2'],
1670
- ['###', 'Header 3'],
1671
- ],
1672
- });
1673
-
1674
- const subsectionDoc = doc.getDocs().find(doc => doc?.metadata?.['Header 3'] === 'Subsection A1');
1675
- expect(subsectionDoc).toBeDefined();
1676
- expect(subsectionDoc?.metadata?.['Header 1']).toBe('Top Level');
1677
- expect(subsectionDoc?.metadata?.['Header 2']).toBe('Section A');
1678
- });
1679
-
1680
- it('should handle code blocks without splitting them', async () => {
1681
- const text = `# Code Section
1682
-
1683
- \`\`\`python
1684
- def hello():
1685
- print("Hello World")
1686
- \`\`\`
1687
-
1688
- ## Next Section`;
1689
-
1690
- const doc = MDocument.fromMarkdown(text, { meta: 'data' });
1691
-
1692
- await doc.chunk({
1693
- strategy: 'markdown',
1694
- headers: [
1695
- ['#', 'Header 1'],
1696
- ['##', 'Header 2'],
1697
- ['###', 'Header 3'],
1698
- ],
1699
- });
1700
-
1701
- const codeDoc = doc.getDocs().find(doc => doc?.text?.includes('```python'));
1702
- expect(codeDoc?.text).toContain('print("Hello World")');
1703
- });
1704
-
1705
- it('should respect returnEachLine option', async () => {
1706
- const text = `# Title
1707
-
1708
- Line 1
1709
- Line 2
1710
- Line 3`;
1711
-
1712
- const doc = MDocument.fromMarkdown(text, { meta: 'data' });
1713
-
1714
- await doc.chunk({
1715
- strategy: 'markdown',
1716
- headers: [['#', 'Header 1']],
1717
- returnEachLine: true,
1718
- stripHeaders: false,
1719
- });
1720
-
1721
- expect(doc.getDocs().length).toBe(4); // Title + 3 lines
1722
- doc
1723
- .getDocs()
1724
- .slice(1)
1725
- .forEach(doc => {
1726
- expect(doc.metadata?.['Header 1']).toBe('Title');
1727
- });
1728
- });
1729
-
1730
- it('should handle stripHeaders option', async () => {
1731
- const text = `# Title
1732
-
1733
- Content`;
1734
-
1735
- const doc = MDocument.fromMarkdown(text, { meta: 'data' });
1736
-
1737
- await doc.chunk({
1738
- strategy: 'markdown',
1739
- headers: [['#', 'Header 1']],
1740
- returnEachLine: false,
1741
- stripHeaders: false,
1742
- });
1743
-
1744
- const docs = doc.getDocs();
1745
- expect(docs?.[0]?.text).toContain('# Title');
1746
- });
1747
-
1748
- it('should remove headers when stripHeaders: true is set in markdown chunker', async () => {
1749
- const markdown = [
1750
- '# H1 Title',
1751
- 'Some intro text.',
1752
- '## H2 Subtitle',
1753
- 'More details.',
1754
- '### H3 Section',
1755
- 'Final content.',
1756
- ].join('\n');
1757
-
1758
- const doc = MDocument.fromMarkdown(markdown);
1759
- const chunks = await doc.chunk({
1760
- strategy: 'markdown',
1761
- maxSize: 500,
1762
- overlap: 0,
1763
- headers: [
1764
- ['#', 'h1'],
1765
- ['##', 'h2'],
1766
- ['###', 'h3'],
1767
- ],
1768
- stripHeaders: true,
1769
- });
1770
- // None of the chunk texts should start with the header patterns
1771
- const headerPatterns = [/^#\s/, /^##\s/, /^###\s/];
1772
- for (const chunk of chunks) {
1773
- for (const pattern of headerPatterns) {
1774
- expect(pattern.test(chunk.text)).toBe(false);
1775
- }
1776
- }
1777
- });
1778
-
- it('should support custom header prefixes', async () => {
- const text = `!!! Important\nThis is important.\n--- Section\nSection content.`;
- const doc = MDocument.fromMarkdown(text);
- await doc.chunk({
- strategy: 'markdown',
- headers: [
- ['!!!', 'important'],
- ['---', 'section'],
- ],
- stripHeaders: true,
- });
- const texts = doc.getText();
- expect(texts.some(t => t.startsWith('!!!'))).toBe(false);
- expect(texts.some(t => t.startsWith('---'))).toBe(false);
- });
-
- it('should attach correct metadata for nested headers', async () => {
- const text = `# H1\n## H2\n### H3\nContent`;
- const doc = MDocument.fromMarkdown(text);
- await doc.chunk({
- strategy: 'markdown',
- headers: [
- ['#', 'h1'],
- ['##', 'h2'],
- ['###', 'h3'],
- ],
- stripHeaders: true,
- });
- const chunk = doc.getDocs().find(c => c.text.includes('Content'));
- expect(chunk?.metadata?.h1).toBe('H1');
- expect(chunk?.metadata?.h2).toBe('H2');
- expect(chunk?.metadata?.h3).toBe('H3');
- });
-
- it('should include header lines as chunks if stripHeaders is false', async () => {
- const text = `# H1\nContent`;
- const doc = MDocument.fromMarkdown(text);
- await doc.chunk({
- strategy: 'markdown',
- headers: [['#', 'h1']],
- stripHeaders: false,
- });
- const texts = doc.getText();
- expect(texts.some(t => t.startsWith('# H1'))).toBe(true);
- });
-
- it('should handle multiple adjacent headers correctly', async () => {
- const text = `# H1\n## H2\n### H3\nContent`;
- const doc = MDocument.fromMarkdown(text);
- await doc.chunk({
- strategy: 'markdown',
- headers: [
- ['#', 'h1'],
- ['##', 'h2'],
- ['###', 'h3'],
- ],
- stripHeaders: true,
- });
- const texts = doc.getText();
- expect(texts.some(t => t === 'Content')).toBe(true);
- expect(texts.some(t => t === '')).toBe(false);
- });
-
- it('should handle content before any header', async () => {
- const text = `Intro before header\n# H1\nContent`;
- const doc = MDocument.fromMarkdown(text);
- await doc.chunk({
- strategy: 'markdown',
- headers: [['#', 'h1']],
- stripHeaders: true,
- });
- const preHeaderChunk = doc.getDocs().find(c => c.text.includes('Intro before header'));
- expect(preHeaderChunk?.metadata?.h1).toBeUndefined();
- });
-
- it('should not treat headers inside code blocks as headers', async () => {
- const text = ['# Real Header', '```', '# Not a header', '```', 'Content'].join('\n');
- const doc = MDocument.fromMarkdown(text);
- await doc.chunk({
- strategy: 'markdown',
- headers: [['#', 'h1']],
- stripHeaders: true,
- });
- const texts = doc.getText();
- expect(texts.some(t => t.includes('# Not a header'))).toBe(true);
- expect(texts.some(t => t.startsWith('# Real Header'))).toBe(false);
- });
- });
-
- describe('metadata extraction', () => {
- it('should extract metadata with default settings', async () => {
- const doc = MDocument.fromMarkdown(
- '# AI and Machine Learning\n\nThis is a test document about artificial intelligence and machine learning.',
- );
-
- const chunks = await doc.chunk({
- strategy: 'markdown',
- extract: {
- title: true,
- summary: true,
- keywords: true,
- },
- });
-
- const metadata = chunks[0].metadata;
- expect(metadata).toBeDefined();
- expect(metadata.documentTitle).toBeDefined();
- expect(metadata.sectionSummary).toBeDefined();
- expect(metadata.excerptKeywords).toMatch(/^KEYWORDS: .*/);
- }, 15000);
-
- it('should extract metadata with custom settings', async () => {
- const doc = MDocument.fromMarkdown(
- '# AI and Machine Learning\n\nThis is a test document about artificial intelligence and machine learning.',
- );
-
- const chunks = await doc.chunk({
- strategy: 'markdown',
- extract: {
- title: {
- nodes: 2,
- nodeTemplate: 'Generate a title for this: {context}',
- combineTemplate: 'Combine these titles: {context}',
- },
- summary: {
- summaries: ['self'],
- promptTemplate: 'Summarize this: {context}',
- },
- questions: {
- questions: 2,
- promptTemplate: 'Generate {numQuestions} questions about: {context}',
- },
- keywords: {
- keywords: 3,
- promptTemplate: 'Extract {maxKeywords} key terms from: {context}',
- },
- },
- });
-
- const metadata = chunks[0].metadata;
- expect(metadata).toBeDefined();
- expect(metadata.documentTitle).toBeDefined();
- expect(metadata.sectionSummary).toBeDefined();
- const qStr = metadata.questionsThisExcerptCanAnswer;
- expect(qStr).toMatch(/1\..*\?/s);
- expect(qStr).toMatch(/2\..*\?/s);
- expect((qStr.match(/\?/g) || []).length).toBeGreaterThanOrEqual(2);
- expect(metadata.excerptKeywords).toMatch(/^1\. .*\n2\. .*\n3\. .*$/);
- }, 15000);
-
- it('should handle invalid summary types', async () => {
- const doc = MDocument.fromText('Test document');
-
- await expect(
- doc.chunk({
- extract: {
- summary: {
- summaries: ['invalid'],
- },
- },
- }),
- ).rejects.toThrow("Summaries must be one of 'self', 'prev', 'next'");
- }, 15000);
- });
-
- describe('metadata preservation', () => {
- const baseText = 'This is a test document for metadata extraction.';
- const baseMetadata = { source: 'unit-test', customField: 123 };
-
- it('preserves metadata with KeywordExtractor', async () => {
- const doc = MDocument.fromText(baseText, { ...baseMetadata });
- const chunks = await doc.chunk({ extract: { keywords: true } });
- const metadata = chunks[0].metadata;
- expect(metadata.source).toBe('unit-test');
- expect(metadata.customField).toBe(123);
- expect(metadata.excerptKeywords).toBeDefined();
- });
-
- it('preserves metadata with SummaryExtractor', async () => {
- const doc = MDocument.fromText(baseText, { ...baseMetadata });
- const chunks = await doc.chunk({ extract: { summary: true } });
- const metadata = chunks[0].metadata;
- expect(metadata.source).toBe('unit-test');
- expect(metadata.customField).toBe(123);
- expect(metadata.sectionSummary).toBeDefined();
- });
-
- it('preserves metadata with QuestionsAnsweredExtractor', async () => {
- const doc = MDocument.fromText(baseText, { ...baseMetadata });
- const chunks = await doc.chunk({ extract: { questions: true } });
- const metadata = chunks[0].metadata;
- expect(metadata.source).toBe('unit-test');
- expect(metadata.customField).toBe(123);
- expect(metadata.questionsThisExcerptCanAnswer).toBeDefined();
- });
-
- it('preserves metadata with TitleExtractor', async () => {
- const doc = MDocument.fromText(baseText, { ...baseMetadata });
- const chunks = await doc.chunk({ extract: { title: true } });
- const metadata = chunks[0].metadata;
- expect(metadata.source).toBe('unit-test');
- expect(metadata.customField).toBe(123);
- expect(metadata.documentTitle).toBeDefined();
- });
-
- it('preserves metadata with multiple extractors', async () => {
- const doc = MDocument.fromText(baseText, { ...baseMetadata });
- const chunks = await doc.chunk({
- extract: {
- keywords: true,
- summary: true,
- questions: true,
- title: true,
- },
- });
- const metadata = chunks[0].metadata;
- expect(metadata.source).toBe('unit-test');
- expect(metadata.customField).toBe(123);
- expect(metadata.excerptKeywords).toBeDefined();
- expect(metadata.sectionSummary).toBeDefined();
- expect(metadata.questionsThisExcerptCanAnswer).toBeDefined();
- expect(metadata.documentTitle).toBeDefined();
- });
- it('preserves metadata on all chunks when multiple are created', async () => {
- const text = 'Chunk one.\n\nChunk two.\n\nChunk three.';
- const doc = MDocument.fromText(text, { source: 'multi-chunk', customField: 42 });
- const chunks = await doc.chunk({
- strategy: 'character',
- separator: '\n\n',
- maxSize: 20,
- overlap: 0,
- extract: { keywords: true },
- });
- expect(chunks.length).toBeGreaterThan(1);
- for (const chunk of chunks) {
- const metadata = chunk.metadata;
- expect(metadata.source).toBe('multi-chunk');
- expect(metadata.customField).toBe(42);
- expect(metadata.excerptKeywords).toBeDefined();
- }
- });
-
- it('overwrites only the matching metadata field with extractor output', async () => {
- const doc = MDocument.fromText('Test for overwrite', {
- excerptKeywords: 'original,keywords',
- unrelatedField: 'should stay',
- source: 'unit-test',
- });
- const chunks = await doc.chunk({ extract: { keywords: true } });
- const metadata = chunks[0].metadata;
- expect(metadata.source).toBe('unit-test');
- expect(metadata.unrelatedField).toBe('should stay');
- expect(metadata.excerptKeywords).not.toBe('original,keywords'); // Should be new keywords
- });
- });
- describe('MDocument TitleExtractor document grouping integration', () => {
- it('groups chunks by docId for title extraction (integration)', async () => {
- const doc = new MDocument({
- docs: [
- { text: 'Alpha chunk 1', metadata: { docId: 'docA' } },
- { text: 'Alpha chunk 2', metadata: { docId: 'docA' } },
- { text: 'Beta chunk 1', metadata: { docId: 'docB' } },
- ],
- type: 'text',
- });
-
- await doc.extractMetadata({ title: true });
- const chunks = doc.getDocs();
-
- const titleA1 = chunks[0].metadata.documentTitle;
- const titleA2 = chunks[1].metadata.documentTitle;
- const titleB = chunks[2].metadata.documentTitle;
-
- expect(titleA1).toBeDefined();
- expect(titleA2).toBeDefined();
- expect(titleB).toBeDefined();
- expect(titleA1).toBe(titleA2);
- expect(titleA1).not.toBe(titleB);
- });
- });
-
- describe('chunkSentence', () => {
- it('should preserve sentence structure and avoid mid-sentence breaks', async () => {
- const text =
- 'A dynamic concert scene captures an energetic, vibrant atmosphere, with a densely packed crowd silhouetted against bright stage lights. The image features beams of white light radiating from multiple projectors, creating dramatic patterns across a darkened room. The audience, comprised of numerous people with raised hands, exudes excitement and engagement, enhancing the lively mood. The setting suggests a large indoor venue, possibly a music or worship event, with text visible on a screen in the background, adding to an immersive experience. The overall composition emphasizes a sense of community and shared enthusiasm, ideal for promoting entertainment events, live concerts, or communal gatherings. The high-contrast lighting and slight haze effect imbue the scene with a modern, electrifying quality.';
-
- const doc = MDocument.fromText(text);
-
- const chunks = await doc.chunk({
- strategy: 'sentence',
- minSize: 50,
- maxSize: 450,
- overlap: 0,
- sentenceEnders: ['.'],
- keepSeparator: true,
- });
-
- expect(chunks.length).toBeGreaterThan(1);
-
- chunks.forEach(chunk => {
- expect(chunk.text.length).toBeGreaterThanOrEqual(50);
- expect(chunk.text.length).toBeLessThanOrEqual(450);
-
- expect(chunk.text.startsWith('.')).toBe(false);
- expect(chunk.text.startsWith(' .')).toBe(false);
-
- expect(chunk.text.endsWith('.')).toBe(true);
- });
- });
-
- it('should require maxSize parameter', async () => {
- const doc = MDocument.fromText('Short text.');
-
- await expect(
- doc.chunk({
- strategy: 'sentence',
- minSize: 50,
- } as any),
- ).rejects.toThrow('Invalid parameters for sentence strategy: maxSize: Required');
- });
-
- it('should handle custom sentence enders', async () => {
- const text =
- 'First sentence with more content to make it longer. Second sentence with additional content! Third sentence with even more text? Fourth sentence with final content.';
-
- const doc = MDocument.fromText(text);
-
- const chunks = await doc.chunk({
- strategy: 'sentence',
- maxSize: 100,
- sentenceEnders: ['.', '!', '?'],
- keepSeparator: true,
- });
-
- expect(chunks.length).toBeGreaterThan(1);
-
- chunks.forEach(chunk => {
- const endsWithValidSeparator = chunk.text.endsWith('.') || chunk.text.endsWith('!') || chunk.text.endsWith('?');
- expect(endsWithValidSeparator).toBe(true);
- });
- });
-
- it('should handle overlap with complete sentences', async () => {
- const text =
- 'First sentence with some content that makes it quite long. Second sentence with different content that also makes it lengthy. Third sentence with more content to ensure multiple chunks. Fourth sentence with final content to complete the test.';
-
- const doc = MDocument.fromText(text);
-
- const chunks = await doc.chunk({
- strategy: 'sentence',
- maxSize: 120,
- overlap: 50,
- sentenceEnders: ['.'],
- keepSeparator: true,
- });
-
- expect(chunks.length).toBeGreaterThan(1);
-
- // Check that overlapping chunks share some content
- if (chunks.length > 1) {
- for (let i = 1; i < chunks.length; i++) {
- const currentChunk = chunks[i].text;
-
- // With overlap: 50, each chunk after the first starts with content carried over
- // from the previous chunk, so its length must exceed the 50-character overlap.
- expect(currentChunk.length).toBeGreaterThan(50); // Should include overlap content
- }
- }
- });
-
- it('should fallback to word splitting for oversized sentences', async () => {
- const longSentence =
- 'This is an extremely long sentence that ' +
- 'word '.repeat(50) +
- 'and should be split into smaller chunks when it exceeds the maximum size limit.';
-
- const doc = MDocument.fromText(longSentence);
-
- const chunks = await doc.chunk({
- strategy: 'sentence',
- maxSize: 100,
- fallbackToWords: true,
- });
-
- expect(chunks.length).toBeGreaterThan(1);
-
- chunks.forEach(chunk => {
- expect(chunk.text.length).toBeLessThanOrEqual(100);
- });
- });
-
- it('should handle short text appropriately', async () => {
- const text = 'Short sentence.';
-
- const doc = MDocument.fromText(text);
-
- const chunks = await doc.chunk({
- strategy: 'sentence',
- minSize: 5,
- maxSize: 100,
- sentenceEnders: ['.'],
- keepSeparator: true,
- });
-
- expect(chunks.length).toBe(1);
- expect(chunks[0].text).toBe(text);
- });
-
- it('should group multiple sentences when they fit within target size', async () => {
- const text = 'Short one. Another short. Third short. Fourth sentence. Fifth one.';
-
- const doc = MDocument.fromText(text);
-
- const chunks = await doc.chunk({
- strategy: 'sentence',
- minSize: 10,
- maxSize: 100,
- targetSize: 40,
- sentenceEnders: ['.'],
- keepSeparator: true,
- });
-
- // Should group multiple short sentences together
- expect(chunks.length).toBeLessThan(5); // Less than the number of sentences
-
- chunks.forEach(chunk => {
- // Each chunk should contain multiple sentences when possible
- expect(chunk.text.length).toBeLessThanOrEqual(100);
- });
- });
-
- it('should preserve metadata across chunks', async () => {
- const text =
- 'First sentence with enough content to make it longer than fifty characters. Second sentence with additional content to ensure multiple chunks. Third sentence with final content.';
- const metadata = { source: 'test', author: 'jest' };
-
- const doc = MDocument.fromText(text, metadata);
-
- const chunks = await doc.chunk({
- strategy: 'sentence',
- maxSize: 100,
- sentenceEnders: ['.'],
- keepSeparator: true,
- });
-
- expect(chunks.length).toBeGreaterThan(1);
-
- chunks.forEach(chunk => {
- expect(chunk.metadata.source).toBe('test');
- expect(chunk.metadata.author).toBe('jest');
- });
- });
-
- it('should handle abbreviations without false sentence breaks', async () => {
- const text =
- 'Dr. Smith went to the U.S.A. at 3:30 a.m. on Monday. He met with Prof. Johnson at the U.N. headquarters.';
-
- const doc = MDocument.fromText(text);
- const chunks = await doc.chunk({
- strategy: 'sentence',
- maxSize: 200,
- sentenceEnders: ['.'],
- keepSeparator: true,
- });
-
- expect(chunks.length).toBeGreaterThanOrEqual(1);
- expect(chunks.length).toBeLessThanOrEqual(2);
-
- const allText = chunks.map(c => c.text).join(' ');
- expect(allText).toContain('Dr. Smith'); // Should keep Dr. together
- expect(allText).toContain('U.S.A.'); // Should keep U.S.A. together
- expect(allText).toContain('a.m.'); // Should keep a.m. together
- expect(allText).toContain('Prof. Johnson'); // Should keep Prof. together
- expect(allText).toContain('U.N.'); // Should keep U.N. together
-
- expect(allText).not.toContain('Dr '); // No broken Dr.
- expect(allText).not.toContain('Prof '); // No broken Prof.
- });
-
- it('should respect fallbackToCharacters setting', async () => {
- const oversizedWord = 'supercalifragilisticexpialidocious'.repeat(5);
- const text = `Short sentence. ${oversizedWord}.`;
-
- const doc1 = MDocument.fromText(text);
- const chunksWithFallback = await doc1.chunk({
- strategy: 'sentence',
- maxSize: 50,
- fallbackToWords: true,
- fallbackToCharacters: true,
- });
-
- // Should split the oversized word
- expect(chunksWithFallback.length).toBeGreaterThan(2);
-
- const doc2 = MDocument.fromText(text);
- const chunksWithoutFallback = await doc2.chunk({
- strategy: 'sentence',
- maxSize: 50,
- fallbackToWords: true,
- fallbackToCharacters: false,
- });
-
- // Should have fewer chunks (oversized word kept intact)
- expect(chunksWithoutFallback.length).toBeLessThan(chunksWithFallback.length);
-
- // Verify fallback disabled keeps oversized content
- const oversizedChunk = chunksWithoutFallback.find(chunk => chunk.text.length > 50);
- expect(oversizedChunk).toBeDefined();
- });
-
- it('should handle complex punctuation and edge cases', async () => {
- const text =
- 'Version 2.0 was released. The score was 3.14159. Mr. & Mrs. Smith arrived at 12:30 p.m. What happened next?';
-
- const doc = MDocument.fromText(text);
- const chunks = await doc.chunk({
- strategy: 'sentence',
- maxSize: 200,
- sentenceEnders: ['.', '?'],
- keepSeparator: true,
- });
-
- expect(chunks.length).toBeGreaterThanOrEqual(1);
- expect(chunks.length).toBeLessThanOrEqual(4);
-
- const allText = chunks.map(c => c.text).join(' ');
- expect(allText).toContain('2.0'); // Should keep version numbers intact
- expect(allText).toContain('3.14159'); // Should keep decimals intact
- expect(allText).toContain('p.m.'); // Should keep time abbreviations intact
- expect(allText).toContain('What happened next?'); // Should end with question
-
- // Should not break on decimals or version numbers
- expect(allText).not.toContain('2 '); // No broken version number
- expect(allText).not.toContain('3 '); // No broken decimal
- });
- });
-
- describe('chunkSemanticMarkdown', () => {
- it('should merge small sections based on token threshold', async () => {
- const text = `# Introduction
- Brief intro paragraph.
-
- ## Setup Guide
- Short setup instructions.
-
- ### Prerequisites
- Very short list.
-
- ### Installation Steps
- Very detailed installation process with code examples and explanations that would normally be quite long but in this test we'll keep it a moderate length for testing purposes.
-
- ## Advanced Configuration
- Another section with moderate content for testing the merging algorithm.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 200,
- });
-
- const chunks = doc.getText();
- const docs = doc.getDocs();
-
- expect(chunks.length).toBeLessThan(6);
-
- expect(docs[0]?.metadata?.tokenCount).toBeDefined();
- expect(typeof docs[0]?.metadata?.tokenCount).toBe('number');
- expect(docs[0]?.metadata?.tokenCount).toBeGreaterThan(0);
- });
-
- it('should respect sibling/parent relationships in merging', async () => {
- const text = `# Main Document
-
- ## Section A
- Content for section A that is moderately long to ensure we have enough tokens for testing the semantic merging algorithm properly.
-
- ### Subsection A1
- This subsection has more content than the previous version to test the hierarchical merging behavior.
-
- ### Subsection A2
- Another subsection with substantial content to verify proper semantic boundary handling.
-
- ## Section B
- Content for section B that is also moderately sized with meaningful text to test cross-section merging behavior.
-
- ### Subsection B1
- This final subsection contains enough content to test the bottom-up merging algorithm effectively.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 100, // Threshold that allows some merging but not everything
- });
-
- const chunks = doc.getText();
- const docs = doc.getDocs();
-
- // Should create fewer chunks than original sections due to merging
- expect(chunks.length).toBeLessThan(7);
- expect(chunks.length).toBeGreaterThanOrEqual(1);
-
- // Verify sections maintain semantic coherence
- const hasSection = chunks.some(chunk => chunk.includes('Section A') || chunk.includes('Subsection A1'));
- expect(hasSection).toBe(true);
-
- expect(docs[0]?.metadata?.tokenCount).toBeDefined();
- expect(docs[0]?.metadata?.tokenCount).toBeGreaterThan(0);
- });
-
- it('should correctly chunk a controlled test document', async () => {
- const controlledTestMarkdown = `# My Test Document
-
- This is a short preamble to test how content before the first header is handled. It should be merged with the first section if that section is small enough.
-
- ## Chapter 1: The Small Sections
-
- This is the introduction to Chapter 1. It contains several small subsections that are perfect candidates for merging.
-
- ### Section 1.1: A Tiny Topic
-
- Just a few words here.
-
- ### Section 1.2: Another Tiny Topic
-
- A few more words to make up a small paragraph.
-
- ## Chapter 2: The Big Section
-
- This chapter has a very large section that should NOT be merged with its sibling because it is over the token limit all by itself.
-
- \`\`\`python
- # This is a large block of Python code.
- # It is designed to have a high token count to test the merging threshold.
- import os
- import sys
-
- class DataProcessor:
- def __init__(self, data):
- self.data = data
- self.length = len(data)
-
- def process(self):
- """
- This is a long docstring to add even more tokens to the count.
- We will iterate through the data and perform some kind of mock processing.
- The goal is to exceed the joinThreshold of 250 tokens easily.
- Let's add more lines to be sure.
- Line 1
- Line 2
- Line 3
- Line 4
- Line 5
- ...and so on.
- """
- results = []
- for i, item in enumerate(self.data):
- # A mock calculation
- processed_item = (item * i) + self.length
- results.append(processed_item)
- return results
-
- # Let's make sure this section is large enough.
- # More comments and code will help.
- def another_function_to_add_tokens():
- """Another long docstring for good measure."""
- x = 1
- y = 2
- z = x + y
- print(f"The result is {z}")
- # End of function
- \`\`\`
-
- ## Chapter 3: The Mixed Bag
-
- This chapter contains a mix of small and medium sections.
-
- ### Section 3.1: A Medium Section
-
- This section is moderately sized. It's not huge, but it has enough content to be a meaningful chunk on its own. We'll aim for about 150 tokens here so it can potentially merge with a small sibling.
-
- ### Section 3.2: A Final Small Section
-
- This final section is very small and should definitely be merged into its predecessor, Section 3.1, because their combined total will be under the threshold.
- `;
-
- const doc = MDocument.fromMarkdown(controlledTestMarkdown);
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 250,
- modelName: 'gpt-3.5-turbo',
- });
-
- const chunks = doc.getText();
- expect(chunks).toHaveLength(3);
- expect(chunks[0]).toContain('# My Test Document');
- expect(chunks[0]).toContain('### Section 1.2: Another Tiny Topic');
- expect(chunks[1]).toContain('## Chapter 2: The Big Section');
- expect(chunks[2]).toContain('## Chapter 3: The Mixed Bag');
- expect(chunks[2]).toContain('### Section 3.2: A Final Small Section');
- });
-
- it('should preserve code blocks during merging', async () => {
- const text = `# Code Example
-
- ## Installation
- Install the package:
-
- \`\`\`bash
- npm install example-package
- \`\`\`
-
- ## Usage
- Here's how to use it:
-
- \`\`\`javascript
- const example = require('example-package');
- example.doSomething();
- \`\`\`
-
- ## Configuration
- Set up your config file.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 300,
- });
-
- const chunks = doc.getText();
-
- // Code blocks should be preserved intact
- expect(chunks.some(chunk => chunk.includes('```bash'))).toBe(true);
- expect(chunks.some(chunk => chunk.includes('```javascript'))).toBe(true);
-
- // Should not split within code blocks
- const bashChunk = chunks.find(chunk => chunk.includes('npm install'));
- expect(bashChunk).toBeDefined();
- expect(bashChunk).toContain('```bash');
- });
-
- it('should work with different tiktoken models', async () => {
- const text = `# Test Document
-
- ## Section 1
- Some content for testing different tiktoken models and their token counting accuracy.
-
- ## Section 2
- More content to verify the token counting works correctly across different model encodings.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 100,
- modelName: 'gpt-4',
- });
-
- const chunks = doc.getText();
- const docs = doc.getDocs();
-
- expect(chunks.length).toBeGreaterThan(0);
- expect(docs[0]?.metadata?.tokenCount).toBeDefined();
- expect(typeof docs[0]?.metadata?.tokenCount).toBe('number');
- });
-
- it('should handle documents with no headers', async () => {
- const text = `This is a document with no markdown headers.
-
- Just regular paragraphs of text that should be processed as a single semantic unit since there are no headers to split on.
-
- More paragraphs here to test the behavior.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 200,
- });
-
- const chunks = doc.getText();
-
- // Should return single chunk since no headers to split on
- expect(chunks.length).toBe(1);
- expect(chunks[0]).toContain('This is a document with no markdown headers');
- });
-
- it('should handle empty sections correctly', async () => {
- const text = `# Document
-
- ## Empty Section
-
- ## Another Section
- Some content here.
-
- ## Final Empty Section
-
- `;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 100,
- });
-
- const chunks = doc.getText();
-
- // Should handle empty sections gracefully
- expect(chunks.length).toBeGreaterThan(0);
- expect(chunks.some(chunk => chunk.includes('Some content here'))).toBe(true);
- });
-
- it('should maintain bottom-up merging order (deepest first)', async () => {
- const text = `# Root
-
- ## Level 2A
- Content 2A
-
- ### Level 3A
- Short content 3A
-
- #### Level 4A
- Short content 4A
-
- ### Level 3B
- Short content 3B
-
- ## Level 2B
- Content 2B`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 200,
- });
-
- const chunks = doc.getText();
-
- // The algorithm should merge from deepest level first
- // Level 4 should merge with Level 3, then Level 3s might merge with Level 2
- expect(chunks.length).toBeLessThan(7); // Less than original 7 sections
-
- // Verify deep nesting is preserved in merged content
- const deepChunk = chunks.find(chunk => chunk.includes('Level 4A') && chunk.includes('Level 3A'));
- expect(deepChunk).toBeDefined();
- });
-
- it('should compare token accuracy vs character-based sizing', async () => {
- // Use text with unicode and varying token densities
- const text = `# Test Document
-
- ## Unicode Section
- This section contains unicode characters: café, naïve, résumé, 中文, العربية
-
- ## Code Section
- \`\`\`python
- def function_with_long_name_and_parameters(param1, param2, param3):
- return param1 + param2 + param3
- \`\`\`
-
- ## Regular Section
- Regular English text without special characters.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 150, // Token-based threshold
- });
-
- const docs = doc.getDocs();
-
- // Verify token counts are provided in metadata
- docs.forEach(doc => {
- expect(doc.metadata.tokenCount).toBeDefined();
- expect(typeof doc.metadata.tokenCount).toBe('number');
- expect(doc.metadata.tokenCount).toBeGreaterThan(0);
- });
-
- // Token count should be different from character count for unicode text
- const unicodeDoc = docs.find(doc => doc.text.includes('café'));
- if (unicodeDoc) {
- const charCount = unicodeDoc.text.length;
- const tokenCount = unicodeDoc.metadata.tokenCount;
-
- // For text with unicode, token count is often different from char count
- expect(tokenCount).toBeDefined();
- expect(tokenCount).not.toBe(charCount);
- }
- });
-
- it('should handle documents with only deep headers (no top-level sections)', async () => {
- const text = `### Deep Section 1
- Short content for deep section 1.
-
- #### Very Deep Section 1.1
- Even shorter content.
-
- #### Very Deep Section 1.2
- Another short subsection.
-
- ### Deep Section 2
- Short content for deep section 2.
-
- #### Very Deep Section 2.1
- Final short content.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 200,
- });
-
- const chunks = doc.getText();
- const docs = doc.getDocs();
-
- // Should merge the small deep sections together
- expect(chunks.length).toBeLessThan(5);
- expect(chunks.length).toBeGreaterThan(0);
-
- // Verify deep headers are preserved in merged content
- const deepChunk = chunks.find(
- chunk => chunk.includes('### Deep Section 1') && chunk.includes('#### Very Deep Section'),
- );
- expect(deepChunk).toBeDefined();
-
- expect(docs[0]?.metadata?.tokenCount).toBeDefined();
- });
-
- it('should leave very large individual sections intact (exceeding joinThreshold)', async () => {
- const largeContent = 'This is a very long section. '.repeat(50); // ~1,450 characters (roughly 350 tokens)
- const text = `# Document Title
-
- ## Small Section
- Small content here.
-
- ## Oversized Section
- ${largeContent}
-
- \`\`\`javascript
- // Adding code to make it even larger
- function processData(data) {
- const results = [];
- for (let i = 0; i < data.length; i++) {
- const processed = data[i] * 2 + Math.random();
- results.push(processed);
- console.log(\`Processed item \${i}: \${processed}\`);
- }
- return results;
- }
-
- // More code to ensure we exceed the threshold
- class DataManager {
- constructor(initialData) {
- this.data = initialData;
- this.processedCount = 0;
- }
-
- process() {
- this.data.forEach((item, index) => {
- // Process each item
- this.processedCount++;
- });
- }
- }
- \`\`\`
-
- ## Another Small Section
- More small content.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 300, // Much smaller than the oversized section
- });
-
- const chunks = doc.getText();
- const docs = doc.getDocs();
-
- expect(chunks.length).toBeGreaterThan(1);
-
- // The oversized section should be left as its own chunk
- const oversizedChunk = chunks.find(chunk => chunk.includes('Oversized Section'));
- expect(oversizedChunk).toBeDefined();
- expect(oversizedChunk).toContain('This is a very long section.');
-
- // Verify the oversized chunk exceeds the threshold
- const oversizedDoc = docs.find(doc => doc.text.includes('Oversized Section'));
- expect(oversizedDoc?.metadata?.tokenCount).toBeGreaterThan(300);
-
- // Small sections should still be merged where possible
- const smallChunk = chunks.find(chunk => chunk.includes('Small Section') && !chunk.includes('Oversized'));
- expect(smallChunk).toBeDefined();
- });
-
- it('should handle mixed header levels with gaps (skipping levels)', async () => {
- const text = `# Top Level
-
- #### Deep Level A (skipped H2 and H3)
- Content for deep level A that is moderately sized with enough text to make it substantial. This section needs to have sufficient content to test the merging behavior properly when header levels are skipped. Let's add more content to ensure we have enough tokens to work with.
-
- ## Middle Level
- Content for middle level section that also needs to be substantial enough to test the algorithm. This section should have enough content to be meaningful when testing the semantic markdown chunking with mixed header levels.
-
- ##### Very Deep Level (skipped H3 and H4)
- Short content for very deep level that should still be substantial enough for testing. Even though this is marked as short, we need enough content to make the test meaningful.
-
- # Another Top Level
-
- This second top-level section should definitely create a boundary that prevents everything from merging into a single chunk. We need substantial content here to ensure proper separation.
-
- ### Medium Deep Level (skipped H2)
- Final content for testing header level gaps. This section also needs substantial content to ensure we're testing the algorithm properly with realistic content sizes.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 150, // Smaller threshold to encourage more chunks
- });
-
- const chunks = doc.getText();
-
- // Should handle the gaps gracefully: assert at least one chunk (the second top-level section usually forces a split)
- expect(chunks.length).toBeGreaterThanOrEqual(1);
-
- // Verify headers with gaps are preserved
- expect(chunks.some(chunk => chunk.includes('#### Deep Level A'))).toBe(true);
- expect(chunks.some(chunk => chunk.includes('##### Very Deep Level'))).toBe(true);
- expect(chunks.some(chunk => chunk.includes('### Medium Deep Level'))).toBe(true);
-
- // Verify both top-level sections are present
- expect(chunks.some(chunk => chunk.includes('# Top Level'))).toBe(true);
- expect(chunks.some(chunk => chunk.includes('# Another Top Level'))).toBe(true);
- });
-
- it('should handle large documents efficiently (performance test)', async () => {
- const sections: string[] = [];
- for (let i = 1; i <= 100; i++) {
- sections.push(`## Section ${i}`);
- sections.push(`This is content for section ${i}. `.repeat(10)); // ~100 tokens each
-
- for (let j = 1; j <= 3; j++) {
- sections.push(`### Subsection ${i}.${j}`);
- sections.push(`This is subsection content ${i}.${j}. `.repeat(5)); // ~50 tokens each
- }
- }
-
- const largeText = `# Large Test Document\n\n${sections.join('\n\n')}`;
-
- const doc = MDocument.fromMarkdown(largeText);
-
- const startTime = Date.now();
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 300,
- });
-
- const duration = Date.now() - startTime;
- const chunks = doc.getText();
- const docs = doc.getDocs();
-
- expect(duration).toBeLessThan(5000);
-
- expect(chunks.length).toBeGreaterThan(10);
- expect(chunks.length).toBeLessThan(400);
-
- docs.forEach(doc => {
- expect(doc.metadata.tokenCount).toBeDefined();
- expect(doc.metadata.tokenCount).toBeGreaterThan(0);
- });
- }, 10000);
-
- it('should maintain semantic coherence with very small joinThreshold', async () => {
- const text = `# Document
-
- This is a substantial preamble section that should have enough content to be meaningful in token counting. We need sufficient content here to test the algorithm properly.
-
- ## Section A
- Brief content for section A that needs to be expanded to ensure we have meaningful token counts for testing the semantic markdown chunking algorithm with a very small threshold.
-
- ### Sub A1
- More substantial content here for subsection A1. This content needs to be long enough to have a reasonable token count that will affect the merging decisions in our semantic chunking algorithm.
-
- ### Sub A2
- Even more substantial content for subsection A2. Again, we need enough tokens here to make the test meaningful and to properly exercise the algorithm's decision-making process.
-
- ## Section B
- Another section with substantial content for section B. This section should also have enough content to be meaningful in our token-based chunking strategy testing.
-
- ### Sub B1
- Final substantial content for subsection B1. This content should complete our test document with enough tokens to properly test the small threshold behavior.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 30, // Even smaller threshold to force separation
- });
-
- const chunks = doc.getText();
-
- // With a very small threshold, we should get at least some separation
- expect(chunks.length).toBeGreaterThanOrEqual(1);
-
- // Verify all chunks have meaningful content
- chunks.forEach(chunk => {
- expect(chunk.trim().length).toBeGreaterThan(0);
- expect(chunk.trim().length).toBeGreaterThan(10);
- });
-
- // Verify we have the main document structure preserved
- const allText = chunks.join(' ');
- expect(allText).toContain('# Document');
- expect(allText).toContain('## Section A');
- expect(allText).toContain('## Section B');
- });
-
- it('should not treat headers inside code blocks as headers for splitting', async () => {
- const text = `# Real Header
-
- Some introductory text explaining code examples.
-
- \`\`\`markdown
- # This is not a real header
- It is inside a code block and should be ignored for chunking.
-
- ## This is also not a real header
- It should be treated as plain text content, not a section boundary.
-
- ### Even deeper fake headers
- Should also be ignored completely.
- \`\`\`
-
- ## A Real Second Header
- This content comes after the code block.
-
- ### A Real Subsection
- With some additional content to test the hierarchy.`;
-
- const doc = MDocument.fromMarkdown(text);
-
- await doc.chunk({
- strategy: 'semantic-markdown',
- joinThreshold: 25, // Low threshold to force separation into 2 or more chunks
- });
-
- const chunks = doc.getText();
-
- // With a low threshold, we should get exactly 2 chunks:
- // 1. "# Real Header" section (with the code block as content)
- // 2. "## A Real Second Header" section (with its subsection)
- // If fake headers were processed, we'd get more than 2 chunks
- expect(chunks.length).toBe(2);
-
- const firstChunk = chunks[0];
- const secondChunk = chunks[1];
-
- expect(firstChunk).toContain('# Real Header');
- expect(firstChunk).toContain('Some introductory text explaining code examples');
- expect(firstChunk).toContain('```markdown');
- expect(firstChunk).toContain('# This is not a real header');
- expect(firstChunk).toContain('## This is also not a real header');
- expect(firstChunk).toContain('### Even deeper fake headers');
- expect(firstChunk).not.toContain('## A Real Second Header');
-
- expect(secondChunk).toContain('## A Real Second Header');
- expect(secondChunk).toContain('### A Real Subsection');
- expect(secondChunk).not.toContain('# Real Header');
- expect(secondChunk).not.toContain('# This is not a real header');
- });
- });
- });
-
- // Helper function to find the longest common substring between two strings
- // (brute force over all substrings; fine for the short strings used in these tests)
- function findCommonSubstring(str1: string, str2: string): string {
- let longest = '';
-
- // Check for substrings of str1 in str2
- for (let i = 0; i < str1.length; i++) {
- for (let j = i + 1; j <= str1.length; j++) {
- const substring = str1.substring(i, j);
- if (substring.length > longest.length && str2.includes(substring)) {
- longest = substring;
- }
- }
- }
-
- return longest;
- }